This page contains the code to combine the original Rmd dataset and the new Mplus output. We also include relevant descriptive statistics for the classes/trajectories. We have used the hard classification variable given by Mplus for our analyses. We conduct sensitivity analyses using the 3STEP approach, data is prepped for this at the end of this page. Please note, if you are running this analysis for yourself, you will need to do this step after running your Mplus models and before analysing the data in Rmd


Combine Mplus and Rmd data

We have read in the processed R data and the Mplus model information to show who is has identified into which class.

dat.descriptives <- readRDS(file = paste0(data_path, "preprocessed_isolation_trajectories_Jan2021_full_sample.rds"))

The Mplus output file contains the raw original data, slope estimate, intercept estimate, factor scores based on the slope and intercept, class probabilities for each person per class, and the hard class classification.

dat.tra.prob.full <- read.table(paste0(mplus_GMM_clustered_full_output_data_path, "GMM_SI_3Cl_full_sample_clustered.txt"), 
                            header = FALSE, 
                            col.names = c("SISOE5", # original social isolation score at age 5
                                         "SISOE7",  # original social isolation score at age 7
                                         "SISOE10", # original social isolation score at age 10
                                         "SISOE12", # original social isolation score at age 12
                                         "I",       # intercept
                                         "S",       # slope
                                         "C_I",     # factor scores based on most likely class membership
                                         "C_S",     # factor scores based on most likely class membership
                                         "CPROB1",  # class probability 1
                                         "CPROB2",  # class probability 2
                                         "CPROB3",  # class probability 3
                                         "C",       # class 
                                         "ID",      # ID
                                         "FAMILYID" # Family ID - twin clustering  
                                       ))   
#check 
head(dat.tra.prob.full)

We will only need the classification probabilities and the hard classification variable. This is then combined with the rest of our Rmd data.

dat.tra.prob <- dat.tra.prob.full %>%
  select(id = ID, 
         prob1 = CPROB1,
         prob2 = CPROB2,
         prob3 = CPROB3,
         class = C
         )
#check 
# dat.tra.prob

#create list of data frames to merge
dataframe_list <- list(
  dat.descriptives,
  dat.tra.prob
)

#merge data frames
dat <- plyr::join_all(
  dataframe_list,
  by = "id" # Alternatively you can join by several columns
  )

#check
# colnames(dat)

The class variable is then renamed to reflect the pattern seen in the classes identified. Here, we name them “Low stable”, “Increasing”, and “Decreasing” to reflect the pattern of each of the linear curves.

dat <- dat %>%
  mutate(class_renamed =
    recode_factor(class,
      "1" = "Low stable",
      "2" = "Increasing",
      "3" = "Decreasing"
    )
  )

data.frame(table(dat$class_renamed))

Spaghetti plot showing all social isolation trajectories over time

combined_social_isolation <- c("isolation_combined_05",
                              "isolation_combined_07",
                              "isolation_combined_10",
                              "isolation_combined_12")

#convert to long format 
dat_long <- dat %>%
  gather(
    key = "time_point_raw",
    value = "social_isolation",
      all_of(combined_social_isolation)) %>%
  select(
    id,
    sex,
    time_point_raw,
    social_isolation,
    class_renamed)

#head(dat_long)

#create variable that has just the number for the time point
dat_long <- dat_long %>%
  mutate(time_point =
  recode_factor(time_point_raw, 
                "isolation_combined_05" = "5", 
                "isolation_combined_07" = "7", 
                "isolation_combined_10" = "10", 
                "isolation_combined_12" = "12")) %>%
  select(
    id,
    sex,
    time_point,
    social_isolation,
    class_renamed)

#head(dat_long)
spaghetti_plot <- ggplot(data = dat_long,
       aes(x = time_point,
           y = social_isolation,
           group = factor(id),
           color = class_renamed)) +
  geom_line() +
  scale_color_manual(name = "Class", values = palette3) +
  labs(y = "Social isolation score",
       x = "Age (years)") +
  scale_x_discrete(expand = c(0.05,0.05), breaks = c("5", "7", "10", "12")) +
  scale_y_continuous(expand = c(0.05,0.05), breaks = c(0,2,4,6,8,10,12)) +
  theme(
        panel.grid.major.y = element_line(size = 0.2,linetype = 'dashed',colour = "gray"),
        axis.title.x = element_text(size = 12,face="bold"),
        axis.title.y = element_text(size = 12,face="bold"),
        axis.text.x = element_text(colour = "black", size = 10),
        axis.text.y = element_text(colour = "black", size = 10),
        panel.background = element_blank(),
        legend.position = "bottom")

The trajectory of each participant is plotted separately and colour coded depending on which class they were classified. Decreasing is indicated in pink, Increasing in Orange, and Low stable in blue. The average class trajectories are then plotted over the top.

#For class models, read in all output files within your folder that you have all your class models
si.classes.all <- readModels(paste0(mplus_GMM_clustered_full_output_data_path), recursive = TRUE) #Recursive means it reads data in sub-folders too
#create three class data frame with means and variances
three_class_si <- as.data.frame(
  si.classes.all$X.Users.katiethompson.Documents.PhD.LISS.DTP_Louise_and_Tim.Social.isolation.trajectories_Paper.1.data_analysis.mplus.GMM.clustered.full_sample.output..isolation_3traj_full_sample_clustered.out$gh5$means_and_variances_data$y_estimated_means)

#name the variables
names(three_class_si) <- c("Class 1","Class 2","Class 3")

#create time point column 
three_class_si$time_point <- factor(c("5","7","10","12"))

#convert data to long format 
three_class_si <- three_class_si %>%
  pivot_longer(-time_point, 
               names_to = "class_renamed", 
               values_to = "social_isolation")

# create factor for Class variable
three_class_si <- three_class_si %>%
  mutate(class_renamed = 
              factor(class_renamed, 
                     levels = c("Class 1","Class 2","Class 3"))) 
spaghetti_plotB <- ggplot(data = dat_long,
                         aes(x = time_point,
                             y = social_isolation,
                             color = class_renamed)) +
                    geom_line(aes(group = factor(id)), alpha = 0.9, size = 0.5) +
                    geom_line(data = three_class_si, aes(group = class_renamed), size = 1, alpha = 1) +
                    scale_color_manual(name = "class_renamed", values = palette_spag) +
                    labs(y = "Social isolation score",
                         x = "Age (years)") +
                    scale_x_discrete(expand = c(0.05,0.05), breaks = c("5", "7", "10", "12")) +
                    scale_y_continuous(expand = c(0.05,0.05), breaks = c(0,2,4,6,8,10,12)) +
                    theme(
                          panel.grid.major.y = element_line(size = 0.2,linetype = 'dashed',colour = "gray"),
                          axis.title.x = element_text(size = 12,face="bold"),
                          axis.title.y = element_text(size = 12,face="bold"),
                          axis.text.x = element_text(colour = "black", size = 10),
                          axis.text.y = element_text(colour = "black", size = 10),
                          panel.background = element_blank(),
                          legend.position = "none") 
spaghetti_plotB

# ggsave(
#   "spaghetti_plot_three_class_colour_coded.jpeg",
#   plot = spaghetti_plotB,
#   device = "jpeg",
#   path = paste0(graph_save_data_path, "trajectories/"),
#   width = 7,
#   height = 5
# )

Descriptives by class

The code to create these descriptives is long, so has been hidden here, please see raw code on Github if interested. The descriptives include ethnicity, SES, zygosity, and sex.

#final descriptives by class table
kable(class.descriptives)
Category Descriptive Low stable % Low stable Increasing % Increasing Decreasing % Decreasing Total % Total
Sex Male 961 47.83 62 58.49 69 58.97 1092 48.92
Female 1048 52.17 44 41.51 48 41.03 1140 51.08
Zygosity MZ 1133 56.40 55 51.89 54 46.15 1242 55.65
DZ 876 43.60 51 48.11 63 53.85 990 44.35
SES Low 630 31.36 57 53.77 55 47.01 742 33.24
Middle 681 33.90 27 25.47 30 25.64 738 33.06
High 698 34.74 22 20.75 32 27.35 752 33.69
Ethnicity White 1818 90.49 102 96.23 98 83.76 2018 90.41
Asian 78 3.88 2 1.89 10 8.55 90 4.03
Black 36 1.79 0 0.00 6 5.13 42 1.88
Mixed race 7 0.35 0 0.00 1 0.85 8 0.36
Other 70 3.48 2 1.89 2 1.71 74 3.32

Adapt variables for regression analysis

We need to relevel the class variable to use the Low Stable class as the reference group in all subsequent regression analyses.

library(nnet)
# relevel class_factor
dat <- dat %>% 
  mutate(
    class_reordered =
      relevel(
        class_renamed,
        ref = "Low stable",
        first = TRUE, #levels in ref come first
        collapse = "+", #String used when constructing names for combined factor levels
        xlevels = TRUE #levels maintained even if not actually occurring
        
      )
  )

Class statistics

Next, we calculate the classification probabilities, class means and NAs per class.

Classification probabilities

For the diagonal, this represents the probability given that the an observation is a part of a specific latent class that you will classify it as said class.

Note this does not represent average probabilities as this table wasn’t available in Mplus version 8.

# extract the classification probabilities for the most likely class memberships (column) by latent class (row). This shows the uncertainty rate of the probabilities. If it was perfect - the diagonal would be 1. E.g. for Class 1 - 98.3% of individuals fit that category. 
class.probs <- data.frame(class.all$X.Users.katiethompson.Documents.PhD.LISS.DTP_Louise_and_Tim.Social.isolation.trajectories_Paper.1.data_analysis.mplus.GMM.clustered.full_sample.output..isolation_3traj_full_sample_clustered.out$class_counts$classificationProbs.mostLikely)

# add column names
colnames(class.probs) <- c("Low stable","Increasing","Decreasing")

class.probs$Class <- c("Low stable","Increasing","Decreasing")

# as.tibble(class.probs)

class.probs <- class.probs %>%
  tidyr::pivot_longer(-Class, 
               names_to = "class", 
               values_to = "Classification probabilities")

class.probs <- class.probs[-c(2:4,6:8),-c(2)]
kable(class.probs)
Class Classification probabilities
Low stable 0.989
Increasing 0.871
Decreasing 0.899

Class means

# extract means for each class 
class.1.means <- as.tibble(class.all$X.Users.katiethompson.Documents.PhD.LISS.DTP_Louise_and_Tim.Social.isolation.trajectories_Paper.1.data_analysis.mplus.GMM.clustered.full_sample.output..isolation_3traj_full_sample_clustered.out$tech7$CLASS.1$classSampMeans)

class.2.means <- as.tibble(class.all$X.Users.katiethompson.Documents.PhD.LISS.DTP_Louise_and_Tim.Social.isolation.trajectories_Paper.1.data_analysis.mplus.GMM.clustered.full_sample.output..isolation_3traj_full_sample_clustered.out$tech7$CLASS.2$classSampMeans)

class.3.means <- as.tibble(class.all$X.Users.katiethompson.Documents.PhD.LISS.DTP_Louise_and_Tim.Social.isolation.trajectories_Paper.1.data_analysis.mplus.GMM.clustered.full_sample.output..isolation_3traj_full_sample_clustered.out$tech7$CLASS.3$classSampMeans)

class.means <- rbind(class.1.means,class.2.means,class.3.means)

class.means$Class <- c("Low stable","Increasing","Decreasing")

class.means <- class.means %>%
  select(
    Class,
    `Mean at age 5` = SISOE5,
    `Mean at age 7` = SISOE7,
    `Mean at age 10` = SISOE10,
    `Mean at age 12` = SISOE12
  )
kable(class.means)
Class Mean at age 5 Mean at age 7 Mean at age 10 Mean at age 12
Low stable 0.605 0.632 0.718 0.673
Increasing 1.291 1.848 3.545 5.027
Decreasing 3.984 3.374 2.347 1.771

Na per person split by class

ctable(dat$class_renamed, dat$na.per.person.si)

Cross-Tabulation, Row Proportions
dat\(class_renamed * dat\)na.per.person.si

dat$na.per.person.si 0 1 2 3 Total
dat$class_renamed
Low stable 1874 (93.3%) 79 (3.9%) 39 (1.9%) 17 (0.8%) 2009 (100.0%)
Increasing 97 (91.5%) 6 (5.7%) 3 (2.8%) 0 (0.0%) 106 (100.0%)
Decreasing 108 (92.3%) 4 (3.4%) 4 (3.4%) 1 (0.9%) 117 (100.0%)
Total 2079 (93.1%) 89 (4.0%) 46 (2.1%) 18 (0.8%) 2232 (100.0%)

Export data

Main analyses

# save R data file
saveRDS(object = dat, file = paste0(data_path, "class_joined_preprocessed_isolation_data_full_sample.rds"))

3STEP method sensitivity analysis

This 3STEP method that will be conducted in Mplus uses the ID variable, the class membership variable.

Remember to manually delete the headers in the csv file and replace NA to . so that Mplus can read the file

Will need to combine the raw data file (to get numeric values and short names to the class statistics)

library(foreign)
dat.raw <- read.spss(file = paste0(data.raw_path, "Katie_18Dec20.sav"),
                  use.value.labels = FALSE, #convert value labels into factors with those labels
                  to.data.frame = TRUE) #return data frame 

#colnames(dat.raw) # check colnames

dat.tra.prob.3step <- dat.tra.prob %>%
  select(atwinid = id,
         prob1,
         prob2,
         prob3,
         class)

#colnames(dat.tra.prob.3step) # check colnames

#create list of data frames to merge
dataframe_list <- list(
  dat.raw,
  dat.tra.prob.3step
)

#merge data frames
dat.3step <- plyr::join_all(
  dataframe_list,
  by = "atwinid" # Alternatively you can join by several columns
  )

#check
#colnames(dat.3step)

Variable REcode based on analysis decisions

Variables that were created in the preprocessing script need to be recomputed here for the 3STEP sensitivity analysis.

ACORN recode

Moderate Means and Hard Pressed were combined, Wealthy Achievers, Urban Prosperity, and Comfortably Off were combined.

dat.3step <- dat.3step %>%
  mutate(P5CACORNCategoryrecoded =
           if_else(
             P5CACORNCategory == 4 | #Moderate Means and Hard Pressed were combined, Wealthy Achievers, Urban Prosperity, and Comfortably Off were combined
               P5CACORNCategory == 5,
             1, # Deprived
             0  # Relatively affluent
           ))
#table(dat.3step$P5CACORNCategoryrecoded)

Harm recode

Possible harm and Definite harm were combined to give a binary variable of Harm VS No harm.

Highest education recode

GSCE grades were collapsed: No qualification and Level 1 (GCSE at grades D-G) were combined. Level 2 (GCSE at grades A*-C) and Level 3 (A Level) were combined.

Parental antisocial bhaviour and alcoholism

We combined maternal and paternal reports to form parental report for antisocial behaviour and alcoholism.

# create total score for mum and data to make parent
dat.3step <- dat.3step %>%
  mutate(antisocialparent =
           asbmm5 + asbfm5) # mum plus dad scores
# create total score for mum and data to make parent
dat.3step <- dat.3step %>%
  mutate(alcoholismparent =
           alcmm5 + alcfm5) # mum plus dad scores

Check column names and export the data

colnames(dat.3step)

[1] “familyid” “atwinid”
[3] “btwinid” “rorderp5”
[5] “torder” “risks”
[7] “cohort” “sampsex”
[9] “zygosity” “sethnic”
[11] “seswq35” “P5CACORNCategory”
[13] “nchildren00e5” “schmeals00e5”
[15] “classsize00e5” “vndngdm5”
[17] “socprbm5” “sisoe5”
[19] “sisoy5” “sisoet5”
[21] “sisoyt5” “sisoem5”
[23] “sisoym5” “harm3em5”
[25] “warme5” “totproe5”
[27] “iqe5” “exfunce5”
[29] “tomtote5” “totexte5”
[31] “intisoe5” “totadde5”
[33] “irre5” “impe5”
[35] “appe5” “slue5”
[37] “ware5” “unce5”
[39] “inhe5” “shye5”
[41] “nmovel5” “nobiodl5”
[43] “tsibl5” “anyviom5”
[45] “tssupm5” “actvm5”
[47] “fdepmm5” “bfiom5”
[49] “bficm5” “bfiem5”
[51] “bfiam5” “bfinm5”
[53] “asbmm5” “asbfm5”
[55] “alcmm5” “alcfm5”
[57] “sisoe7” “sisoy7”
[59] “sisoet7” “sisoyt7”
[61] “sisoem7” “sisoym7”
[63] “sisoe10” “sisoy10”
[65] “sisoet10” “sisoyt10”
[67] “sisoem10” “sisoym10”
[69] “sisoe12” “sisoy12”
[71] “sisoet12” “sisoyt12”
[73] “sisoem12” “sisoym12”
[75] “dxmdee18” “mdesxe18”
[77] “dxgade18” “gadsxe18”
[79] “dxadhd5_18e” “SR_symtot18e”
[81] “cdmode18” “cdsxe18”
[83] “dxalcdepe18” “dxalcabue18”
[85] “alcsxe18” “dxmarje18”
[87] “marjsxe18” “dxptsd5lfe18”
[89] “dxptsd5cue18” “psysympe18”
[91] “psysymp01e18” “psyexpe18”
[93] “psyexpce18” “sharmsuice18”
[95] “bmie18” “phyacte18”
[97] “smkcnume18” “lnCRP_E18_4SD”
[99] “lonelye18” “lifsate18”
[101] “teche18” “psqie18”
[103] “neete18” “educachve18”
[105] “jprepse18” “jprepae18”
[107] “jbschacte18” “optime18”
[109] “srvusemhe18” “copstrse18”
[111] “prob1” “prob2”
[113] “prob3” “class”
[115] “P5CACORNCategoryrecoded” “harm3em5recoded”
[117] “educachve18recoded” “antisocialparent”
[119] “alcoholismparent”

# save csv file 
write_csv(x = dat.3step, path = paste0(data_path, "FOR_MPLUS_preprocessed_isolation_trajectories_Sep2021_3STEP.csv")) # save data in data file
write_csv(x = dat.3step, path = "/Users/katiethompson/Desktop/FOR_MPLUS_preprocessed_isolation_trajectories_Sep2021_3STEP.csv") # save data on desktop for Mplus 
 

Work by Katherine N Thompson

katherine.n.thompson@kcl.ac.uk