#source raw data directory: data_raw and data included
source("../../../isolation_trajectories_data_path.R")si.quad.all <- readModels(paste0(mplus_GMM_clustered_full_output_QUAD_data_path), recursive = TRUE)#extract the summary variables from the mplus output files. Assuming there are multiple files in the above directory, model summaries could be retained as a data.frame as follows:
si_quad_summaries <- do.call("rbind.fill",
sapply(si.quad.all,
"[",
"summaries"))Here we only reprt trsults for the two and three class models as the four to six class models did not converge.
Significance values for 2 class model: Q has significant mean for class 1 (0.044), although only just. Q has non significant mean for class 2 (0.085).
Significance values for 3 class model: Q has significant mean for class 1 (0.001). Q has significant mean for class 2 (0.002). Q has significant mean for class 3 (0.008).
# two class
quad.mean.two.class <- si.quad.all$X.Users.katiethompson.Documents.PhD.LISS.DTP_Louise_and_Tim.Social.isolation.trajectories_Paper.1.data_analysis.mplus.GMM.clustered.full_sample.quadratic..isolation_2traj_full_sample_clustered_QUAD.out$parameters$unstandardized %>%
filter(paramHeader == "Means" & LatentClass < 7) # filter out the means for I, S and Q for each class
quad.variance.two.class <- si.quad.all$X.Users.katiethompson.Documents.PhD.LISS.DTP_Louise_and_Tim.Social.isolation.trajectories_Paper.1.data_analysis.mplus.GMM.clustered.full_sample.quadratic..isolation_2traj_full_sample_clustered_QUAD.out$parameters$unstandardized %>%
filter(paramHeader == "Variances" & LatentClass < 7) # filter out the means for I, S and Q for each class
# three class
quad.mean.three.class <- si.quad.all$X.Users.katiethompson.Documents.PhD.LISS.DTP_Louise_and_Tim.Social.isolation.trajectories_Paper.1.data_analysis.mplus.GMM.clustered.full_sample.quadratic..isolation_3traj_full_sample_clustered_QUAD.out$parameters$unstandardized %>%
filter(paramHeader == "Means" & LatentClass < 7) # filter out the means for I, S and Q for each class
quad.variance.three.class <- si.quad.all$X.Users.katiethompson.Documents.PhD.LISS.DTP_Louise_and_Tim.Social.isolation.trajectories_Paper.1.data_analysis.mplus.GMM.clustered.full_sample.quadratic..isolation_3traj_full_sample_clustered_QUAD.out$parameters$unstandardized %>%
filter(paramHeader == "Variances" & LatentClass < 7) # filter out the means for I, S and Q for each class two.class.mean.var <- rbind(quad.mean.two.class, quad.variance.two.class) %>%
select(`Mean/variance` = paramHeader,
`Intercept/slope/quadratic` = param,
`estimate` = est,
`standard error` = se,
`p value` = pval)
kable(two.class.mean.var)| Mean/variance | Intercept/slope/quadratic | estimate | standard error | p value |
|---|---|---|---|---|
| Means | I | 0.714 | 0.030 | 0.000 |
| Means | S | 0.033 | 0.022 | 0.127 |
| Means | Q | -0.006 | 0.003 | 0.044 |
| Means | I | 1.979 | 0.227 | 0.000 |
| Means | S | -0.087 | 0.254 | 0.732 |
| Means | Q | 0.067 | 0.039 | 0.085 |
| Variances | I | 0.642 | 0.119 | 0.000 |
| Variances | S | 0.081 | 0.029 | 0.006 |
| Variances | Q | 0.001 | 0.001 | 0.013 |
| Variances | I | 0.642 | 0.119 | 0.000 |
| Variances | S | 0.081 | 0.029 | 0.006 |
| Variances | Q | 0.001 | 0.001 | 0.013 |
three.class.mean.var <- rbind(quad.mean.three.class, quad.variance.three.class) %>%
select(`Mean/variance` = paramHeader,
`Intercept/slope/quadratic` = param,
`estimate` = est,
`standard error` = se,
`p value` = pval)
kable(three.class.mean.var)| Mean/variance | Intercept/slope/quadratic | estimate | standard error | p value |
|---|---|---|---|---|
| Means | I | 0.594 | 0.041 | 0.000 |
| Means | S | 0.078 | 0.023 | 0.001 |
| Means | Q | -0.010 | 0.003 | 0.001 |
| Means | I | 1.417 | 0.223 | 0.000 |
| Means | S | -0.142 | 0.207 | 0.493 |
| Means | Q | 0.095 | 0.031 | 0.002 |
| Means | I | 4.274 | 0.638 | 0.000 |
| Means | S | -0.826 | 0.202 | 0.000 |
| Means | Q | 0.069 | 0.026 | 0.008 |
| Variances | I | 0.244 | 0.095 | 0.010 |
| Variances | S | 0.073 | 0.028 | 0.010 |
| Variances | Q | 0.001 | 0.000 | 0.014 |
| Variances | I | 0.244 | 0.095 | 0.010 |
| Variances | S | 0.073 | 0.028 | 0.010 |
| Variances | Q | 0.001 | 0.000 | 0.014 |
| Variances | I | 0.244 | 0.095 | 0.010 |
| Variances | S | 0.073 | 0.028 | 0.010 |
| Variances | Q | 0.001 | 0.000 | 0.014 |
# remove 4,5,6 class models
si.two_three.summaries <- si_quad_summaries[-c(3:5),] #social isolation summaries
si.class.summaries <- data.frame(matrix(nrow = 2,
ncol = 8))
#create column names for the variables you will be adding to the empty matrix of si.class.summaries
colnames(si.class.summaries) <- c("Model",
"Parameters",
"AIC",
"BIC",
"aBIC",
"Entropy",
"VLMR LRT p-value",
"Class Proportions") #or whichever indices you want to compare; do si_summaries$ to see what's in there
#check colnames
#si.class.summaries
#create "Model" variable
si.class.summaries <- si.class.summaries %>%
mutate(
Model =
as.factor(c("Two Class", #change nrow in data frame above when adding more models
"Three Class")))
#add summary information into data frame
si.class.summaries <- si.class.summaries %>%
mutate(
Parameters =
si.two_three.summaries$Parameters) %>% #parameters col from original summaries data set
mutate(
AIC =
si.two_three.summaries$AIC) %>% #AIC col from original summaries data set
mutate(
BIC =
si.two_three.summaries$BIC) %>% #BIC col from original summaries data set
mutate(
aBIC =
si.two_three.summaries$aBIC) %>% #aBIC col from original summaries data set
mutate(
Entropy =
si.two_three.summaries$Entropy) %>% #Entropy col from original summaries data set
mutate(
`VLMR LRT p-value` =
si.two_three.summaries$T11_VLMR_PValue) #T11_VLMR_PValue col from original summaries data set
#check
#si.class.summaries#for each class we want to add in the model estimated class counts - then convert this into a list with the percentage of people in each class
#two classes
si.class.summaries$`Class Proportions`[si.class.summaries$Model == "Two Class"] <- list(c(sprintf("%.0f", si.quad.all$X.Users.katiethompson.Documents.PhD.LISS.DTP_Louise_and_Tim.Social.isolation.trajectories_Paper.1.data_analysis.mplus.GMM.clustered.full_sample.quadratic..isolation_2traj_full_sample_clustered_QUAD.out$class_counts$modelEstimated$proportion*100)))
#three classes
si.class.summaries$`Class Proportions`[si.class.summaries$Model == "Three Class"] <- list(c(sprintf('%.0f', si.quad.all$X.Users.katiethompson.Documents.PhD.LISS.DTP_Louise_and_Tim.Social.isolation.trajectories_Paper.1.data_analysis.mplus.GMM.clustered.full_sample.quadratic..isolation_3traj_full_sample_clustered_QUAD.out$class_counts$modelEstimated$proportion*100)))#two classes
si.class.summaries$`Class N`[si.class.summaries$Model == "Two Class"] <- list(c(sprintf("%.0f", si.quad.all$X.Users.katiethompson.Documents.PhD.LISS.DTP_Louise_and_Tim.Social.isolation.trajectories_Paper.1.data_analysis.mplus.GMM.clustered.full_sample.quadratic..isolation_2traj_full_sample_clustered_QUAD.out$class_counts$posteriorProb$count)))
#three classes
si.class.summaries$`Class N`[si.class.summaries$Model == "Three Class"] <- list(c(sprintf("%.0f", si.quad.all$X.Users.katiethompson.Documents.PhD.LISS.DTP_Louise_and_Tim.Social.isolation.trajectories_Paper.1.data_analysis.mplus.GMM.clustered.full_sample.quadratic..isolation_3traj_full_sample_clustered_QUAD.out$class_counts$posteriorProb$count)))#Look at summary table
kable(si.class.summaries)| Model | Parameters | AIC | BIC | aBIC | Entropy | VLMR LRT p-value | Class Proportions | Class N |
|---|---|---|---|---|---|---|---|---|
| Two Class | 17 | 25392.55 | 25489.63 | 25435.62 | 0.950 | 0.0562 | 93, 7 | 2071, 161 |
| Three Class | 21 | 24762.68 | 24882.60 | 24815.88 | 0.956 | 0.2802 | 90, 5 , 5 | 2014, 114 , 104 |
#create two class data frame with means and variances
two_class_si <- as.data.frame(
si.quad.all$X.Users.katiethompson.Documents.PhD.LISS.DTP_Louise_and_Tim.Social.isolation.trajectories_Paper.1.data_analysis.mplus.GMM.clustered.full_sample.quadratic..isolation_2traj_full_sample_clustered_QUAD.out$gh5$means_and_variances_data$y_estimated_means)
#name the variables
names(two_class_si) <- c("Class 1","Class 2")
#create time point column
two_class_si$timepoint <- c(5, 7, 10, 12)
#convert data to long format
two_class_si <- two_class_si %>%
pivot_longer(-timepoint,
names_to = "Class",
values_to = "social.isolation_score")
#check
#two_class_sitwo_traj_plot_clustered_full_sample <- ggplot(two_class_si,
aes(x = timepoint,
y = social.isolation_score,
colour = Class)) +
geom_line(size = 1.5) +
geom_point(aes(#shape = class,
size = 1)) +
scale_fill_manual(values = palette2) +
scale_color_manual(values = palette2) +
labs(x = "Age (in years)",
title = "Two class model",
color = "Class",
y ="Social isolation score") +
scale_y_continuous(expand = c(0.1,0.1),
limits = c(0,12),
breaks = seq(0, 12, 2)) +
scale_x_continuous(expand = c(0.1,0.1),
limits = c(5,12),
breaks = c(5, 7, 10, 12)) +
theme(panel.grid.major.y = element_line(size = 0.5,
linetype = 'dashed',
colour = "gray"),
axis.text = element_text(colour="black",
size = 12),
axis.title = element_text(colour="black",
size = 12),
panel.background = element_blank(),
plot.title = element_text(size = 16, hjust = 0.5)) +
guides(shape = FALSE, size = FALSE)
two_traj_plot_clustered_full_sample
#create three class data frame with means and variances
three_class_si <- as.data.frame(
si.quad.all$X.Users.katiethompson.Documents.PhD.LISS.DTP_Louise_and_Tim.Social.isolation.trajectories_Paper.1.data_analysis.mplus.GMM.clustered.full_sample.quadratic..isolation_3traj_full_sample_clustered_QUAD.out$gh5$means_and_variances_data$y_estimated_means)
#name the variables
names(three_class_si) <- c("Class 1","Class 2","Class 3")
#create time point column
three_class_si$timepoint <- c(5, 7, 10, 12)
#convert data to long format
three_class_si <- three_class_si %>%
pivot_longer(-timepoint,
names_to = "Class",
values_to = "social.isolation_score")
#check
#three_class_sithree_traj_plot_clustered_full_sample <- ggplot(three_class_si,
aes(x = timepoint,
y = social.isolation_score,
colour = Class)) +
geom_line(size = 1.5) +
geom_point(aes(size = 1)) +
scale_fill_manual(values = palette3) +
scale_color_manual(values = palette3) +
labs(x = "Age (in years)",
title = "Three class model",
color = "Class",
y ="Social isolation score") +
scale_y_continuous(expand = c(0.1,0.1),
limits = c(0,12),
breaks = seq(0, 12, 2)) +
scale_x_continuous(expand = c(0.1,0.1),
limits = c(5,12),
breaks = c(5, 7, 10, 12)) +
theme(panel.grid.major.y = element_line(size = 0.5,
linetype = 'dashed',
colour = "gray"),
axis.text = element_text(colour="black",
size = 12),
axis.title = element_text(colour="black",
size = 12),
panel.background = element_blank(),
plot.title = element_text(size = 16, hjust=0.5)) +
guides(shape = FALSE, size = FALSE)
three_traj_plot_clustered_full_sample
dat.descriptives <- readRDS(file = paste0(data_path, "preprocessed_isolation_trajectories_Jan2021_full_sample.rds"))
#colnames(dat.descriptives)dat.tra.prob.full.LINEAR <- read.table(paste0(mplus_GMM_clustered_full_output_data_path, "GMM_SI_3Cl_full_sample_clustered.txt"),
header = FALSE,
col.names = c("SISOE5", # original social isolation score at age 5
"SISOE7", # original social isolation score at age 7
"SISOE10", # original social isolation score at age 10
"SISOE12", # original social isolation score at age 12
"I", # intercept
"S", # slope
"C_I", # factor scores based on most likely class membership
"C_S", # factor scores based on most likely class membership
"CPROB1", # class probability 1
"CPROB2", # class probability 2
"CPROB3", # class probability 3
"C", # class
"ID", # ID
"FAMILYID" # Family ID - twin clustering
))
#check
#dat.tra.prob.full.LINEARdat.tra.prob.full.QUAD <- read.table(paste0(mplus_GMM_clustered_full_output_QUAD_data_path, "GMM_SI_3Cl_full_sample_clustered_QUAD.txt"),
header = FALSE,
col.names = c("SISOE5", # original social isolation score at age 5
"SISOE7", # original social isolation score at age 7
"SISOE10", # original social isolation score at age 10
"SISOE12", # original social isolation score at age 12
"I", # intercept
"S", # slope
"Q", # quadratic
"C_I", # factor scores based on most likely class membership
"C_S", # factor scores based on most likely class membership
"C_Q", # factor scores based on most likely class membership
"CPROB1", # class probability 1
"CPROB2", # class probability 2
"CPROB3", # class probability 3
"C", # class
"ID", # ID
"FAMILYID" # Family ID - twin clustering
))
#check
#dat.tra.prob.full.QUADdat.tra.prob.LINEAR <- dat.tra.prob.full.LINEAR %>%
select(id = ID,
prob1.linear = CPROB1,
prob2.linear = CPROB2,
prob3.linear = CPROB3,
class.linear = C
)
#check
#dat.tra.prob.LINEAR
dat.tra.prob.QUAD <- dat.tra.prob.full.QUAD %>%
select(id = ID,
prob1.quad = CPROB1,
prob2.quad = CPROB2,
prob3.quad = CPROB3,
class.quad = C
)
#check
#dat.tra.prob.QUAD
#create list of data frames to merge
dataframe_list <- list(
dat.descriptives,
dat.tra.prob.LINEAR,
dat.tra.prob.QUAD
)
#merge data frames
dat <- plyr::join_all(
dataframe_list,
by = "id" # Alternatively you can join by several columns
)
#check
#colnames(dat)# linear
dat <- dat %>%
mutate(class_renamed.linear =
recode_factor(class.linear,
"1" = "Low stable",
"2" = "Increasing",
"3" = "Decreasing"
)
)
as.data.frame(table(dat$class_renamed.linear)) %>%
select(`Linear classes` = Var1,
Freq)# quadratic
dat <- dat %>%
mutate(class_renamed.quad =
recode_factor(class.quad,
"1" = "Low stable",
"2" = "Increasing",
"3" = "Decreasing"
)
)
as.data.frame(table(dat$class_renamed.quad)) %>%
select(`Quadratic classes` = Var1,
Freq)96.51% of people were classified the same way regardless if the model was linear or quadratic.
Overall average movement from Linear to Quadratic:
dat <- dat %>%
mutate(
matching.class =
case_when(
class.linear == class.quad ~ "Matching",
class.linear != class.quad ~ "Not matching"
)
)
freq(dat$matching.class,
cumul = FALSE,
display.type = FALSE,
headings = FALSE,
style = "rmarkdown",
report.nas = FALSE)| Freq | % | |
|---|---|---|
| Matching | 2154 | 96.51 |
| Not matching | 78 | 3.49 |
| Total | 2232 | 100.00 |
dat <- dat %>%
mutate(matching.isolated =
case_when(
matching.class == "Matching" & class_renamed.linear == "Increasing" ~ "Matching and increasing",
matching.class == "Matching" & class_renamed.linear == "Decreasing" ~ "Matching and decreasing",
matching.class == "Matching" & class_renamed.linear == "Low stable" ~ "Matching and low stable"
))
freq(dat$matching.isolated,
cumul = FALSE,
display.type = FALSE,
headings = FALSE,
style = "rmarkdown",
report.nas = FALSE)| Freq | % | |
|---|---|---|
| Matching and decreasing | 88 | 4.09 |
| Matching and increasing | 87 | 4.04 |
| Matching and low stable | 1979 | 91.88 |
| Total | 2154 | 100.00 |
dat.non.match <- dat %>%
filter(matching.class == "Not matching")# from linear to quadratic model
dat.non.match <- dat.non.match %>%
mutate(
direction.match =
case_when(
class_renamed.linear == "Increasing" & class_renamed.quad == "Low stable" ~ "Increasing to low stable",
class_renamed.linear == "Decreasing" & class_renamed.quad == "Low stable" ~ "Decreasing to low stable",
class_renamed.linear == "Increasing" & class_renamed.quad == "Decreasing" ~ "Increasing to decreasing",
class_renamed.linear == "Decreasing" & class_renamed.quad == "Increasing" ~ "Decreasing to increasing",
class_renamed.linear == "Low stable" & class_renamed.quad == "Increasing" ~ "Low stable to increasing",
class_renamed.linear == "Low stable" & class_renamed.quad == "Decreasing" ~ "Low stable to decreasing",
)
)
freq(dat.non.match$direction.match,
cumul = FALSE,
display.type = FALSE,
headings = FALSE,
style = "rmarkdown",
report.nas = FALSE)| Freq | % | |
|---|---|---|
| Decreasing to increasing | 1 | 1.28 |
| Decreasing to low stable | 28 | 35.90 |
| Increasing to low stable | 19 | 24.36 |
| Low stable to decreasing | 11 | 14.10 |
| Low stable to increasing | 19 | 24.36 |
| Total | 78 | 100.00 |
Work by Katherine N Thompson
katherine.n.thompson@kcl.ac.uk