get data

library(survival)
library(car)
g <- read.csv("GMPH_1K_final_2.csv", header = TRUE, sep = ",")
g %>% dim()
# g %>% head()
g$gender <- as.factor(g$gender)
g$copd <- as.factor(g$copd)
g$ethnicgroup <- as.factor(g$ethnicgroup)
levels(g$ethnicgroup) <- c(levels(g$ethnicgroup), "8")
g$ethnicgroup[is.na(g$ethnicgroup)] <- "8"
table(g$ethnicgroup, exclude = NULL)

# set quintile 0 to NAs, coxph will ignore these patients
g <- g %>% mutate(quintile = recode(quintile, "c('0')=NA"))
table(g$quintile, exclude = NULL)
g$quintile <- as.factor(g$quintile)
g %>% names()

option1, using apriori variables

  • https://www.ncbi.nlm.nih.gov/books/NBK513479/
  • Objective 1: what are the main patient, primary care and hospital factors associated with variation in readmission and mortality rates?
  • Table 6: Odds ratios with 95% CIs for possible patient, trust and primary care predictors of mortality within 1 year of admission for HF and COPD patients
  • paper used logistic regression with outcome = death within 1 year of follow-up, but interesting to see if significant predictors apply to Cox regression
  • significant predictors = age, gender (male = ref), ethnicgroup (white = ref), ihd, stroke, valvular_disease, pvd, copd, pneumonia, hypertension, cancer, metastatic_cancer, senile, dementia, mental health, prior_dnas, los (0 nights = ref), cabg

  • use age, los and prior_dnas as continuous for this exercise
# check class of each column
g %>% dim() # [1] 1000   31
g %>% str()

# get new df for selected variables
g1 <- g %>% select(c(age, gender, ethnicgroup, ihd, stroke, valvular_disease, pvd, copd, pneumonia, hypertension, cancer, metastatic_cancer, senile, dementia, mental_health, prior_dnas, los, cabg, fu_time))
g1 %>% dim() # [1] 1000   21

# convert all categorical variables to factors
g1 <- g1 %>% mutate_if(~ n_distinct(.) == 2, as.factor)
g1$ethnicgroup <- as.factor(g1$ethnicgroup)

# attach death on its own, since previous mutate_if() affects it
g1$death <- g$death
g1 %>% str()

# run cox model on apriori variables
cox <- coxph(Surv(fu_time, death) ~ ., data = g1)
summary(cox)

option2, include all available predictors

g %>% str()
g2 <- g %>% select(-c(id, death))
g2 <- g2 %>% mutate_if(~n_distinct(.) == 2, as.factor)
g2$death <- g$death
g2 %>% str()
g2$prior_dnas %>% table(., exclude = NULL)
g2$ethnicgroup %>% table(., exclude = NULL)
g2$quintile %>% table(., exclude = NULL)

# only 3 patients had crt=1, model could not converge, remove crt
g2$crt %>% table(., exclude = NULL)
g2 <- g2 %>% select(-crt)

# fit model2 with all available predictors
cox2 <- coxph(Surv(fu_time, death) ~ ., data = g2)
cox2_coef_before <- summary(cox2)$coefficients %>%
  as.data.frame() %>%
  rownames_to_column(., "predictors") %>%
  rename("HR_before" = `exp(coef)`)
cox2_coef_before

option3, backward elimination

# get significant variables
vars_sig <- summary(cox2)$coefficients %>% 
  as.data.frame() %>%
  rownames_to_column(., "predictors") %>% 
  filter(`Pr(>|z|)` < 0.05) %>%
  select(predictors) %>%
  unlist() %>% unname()
vars_sig <- vars_sig %>% gsub('[0-9]+', '', .)

# rerun model with only significant variables
fmla <- as.formula(paste("Surv(fu_time, death) ~ ", paste(vars_sig, collapse=" + "), sep = ""))
cox2_vars_sig <- coxph(fmla, data = g2)
cox2_coef_after <- summary(cox2_vars_sig)$coefficients %>%
  as.data.frame() %>% 
  rownames_to_column(., "predictors") %>%
  rename("HR_after" = `exp(coef)`)
cox2_coef_after
##            predictors         coef   HR_after    se(coef)            z
## 1                 los  0.013156517  1.0132434 0.003207110  4.102296895
## 2                 age  0.062300175  1.0642818 0.005753519 10.828185358
## 3             gender2 -0.299286306  0.7413471 0.097075407 -3.083029114
## 4           dementia1  0.610253737  1.8408984 0.185367524  3.292128645
## 5                ihd1  0.198290786  1.2193169 0.095267929  2.081401243
## 6   valvular_disease1  0.212028608  1.2361832 0.108826412  1.948319392
## 7  metastatic_cancer1  2.333592101 10.3149273 0.370321509  6.301530010
## 8          prior_dnas  0.126983722  1.1353985 0.039979444  3.176225338
## 9           quintile2 -0.338291025  0.7129878 0.157005856 -2.154639545
## 10          quintile3  0.111668957  1.1181426 0.152809420  0.730772728
## 11          quintile4  0.001467675  1.0014688 0.151909946  0.009661478
## 12          quintile5  0.156418615  1.1693156 0.154448715  1.012754395
## 13       ethnicgroup2 -0.291773211  0.7469379 0.361218378 -0.807747416
## 14       ethnicgroup3 -0.819252318  0.4407611 0.416641231 -1.966325602
## 15       ethnicgroup9  0.427600879  1.5335739 0.364356797  1.173577335
## 16       ethnicgroup8  0.063274526  1.0653193 0.228335104  0.277112564
##        Pr(>|z|)
## 1  4.090689e-05
## 2  2.531218e-27
## 3  2.049051e-03
## 4  9.943212e-04
## 5  3.739719e-02
## 6  5.137676e-02
## 7  2.947217e-10
## 8  1.492050e-03
## 9  3.119005e-02
## 10 4.649180e-01
## 11 9.922914e-01
## 12 3.111775e-01
## 13 4.192360e-01
## 14 4.926101e-02
## 15 2.405644e-01
## 16 7.816937e-01
# compare cox2_before vs cox2_after HRs, see which has > 0.05 difference
inner_join(cox2_coef_before, cox2_coef_after, by = "predictors") %>%
  select(predictors, HR_before, HR_after) %>%
  mutate(HR_diff = HR_after - HR_before) %>%
  filter(abs(HR_diff) > 0.05)
##           predictors HR_before  HR_after    HR_diff
## 1 metastatic_cancer1 8.3178427 10.314927 1.99708467
## 2       ethnicgroup9 1.4651009  1.533574 0.06847298
## 3       ethnicgroup8 0.9816789  1.065319 0.08364040
"          predictors HR_before  HR_after    HR_diff
1 metastatic_cancer1 8.3178427 10.314927 1.99708467
2       ethnicgroup8 0.9816789  1.065319 0.08364040
3       ethnicgroup9 1.4651009  1.533574 0.06847298"
## [1] "          predictors HR_before  HR_after    HR_diff\n1 metastatic_cancer1 8.3178427 10.314927 1.99708467\n2       ethnicgroup8 0.9816789  1.065319 0.08364040\n3       ethnicgroup9 1.4651009  1.533574 0.06847298"
vars_sig
##  [1] "los"               "age"               "gender"           
##  [4] "dementia"          "ihd"               "valvular_disease" 
##  [7] "metastatic_cancer" "prior_dnas"        "quintile"         
## [10] "ethnicgroup"
  • looks like metastatic_cancer and ethnicgroup’s HR changed > 0.05

add non-sig variables back

# get list of non-significant variables to add back
vars_sig
vars_all <- names(g2)
vars_to_addback <- setdiff(vars_all[!vars_all %in% c("fu_time", "death")], vars_sig); vars_to_addback

i = 1
for (i in 1:length(vars_to_addback)){
  bkwd_elim_vars <- c(vars_sig, vars_to_addback[i]); bkwd_elim_vars
  fmla <- as.formula(paste("Surv(fu_time, death) ~ ", paste(bkwd_elim_vars, collapse=" + "), sep = ""))
  cox2_bkwd_elim_vars <- coxph(fmla, data = g2)
  bkwd_elim_HRs <- summary(cox2_bkwd_elim_vars)$coefficients %>%
    as.data.frame() %>%
    rownames_to_column(., "predictors") %>%
    rename("HR_bkwd_elim" = `exp(coef)`) %>%
    select(predictors, HR_bkwd_elim) %>%
    filter(predictors %in% c("metastatic_cancer1", "ethnicgroup8", "ethnicgroup9"))
  print(paste0("added <", vars_to_addback[i], "> back:"))
  print(paste0("bkwd_elim_HRs are:"))
  print(bkwd_elim_HRs)
  print("")
}
# [1] "added <cancer> back:"
# [1] "bkwd_elim_HRs are:"
#           predictors HR_bkwd_elim
# 1 metastatic_cancer1     8.583697   <--
# 2       ethnicgroup8     1.077854
# 3       ethnicgroup9     1.551762

# [1] "added <renal_disease> back:"
# [1] "bkwd_elim_HRs are:"
#           predictors HR_bkwd_elim
# 1 metastatic_cancer1     9.895061   <--
# 2       ethnicgroup8     1.050404
# 3       ethnicgroup9     1.541174

# [1] "added <arrhythmias> back:"
# [1] "bkwd_elim_HRs are:"
#           predictors HR_bkwd_elim
# 1 metastatic_cancer1    10.566096
# 2       ethnicgroup8     1.027209
# 3       ethnicgroup9     1.478646   <--

# [1] "added <pneumonia> back:"
# [1] "bkwd_elim_HRs are:"
#           predictors HR_bkwd_elim
# 1 metastatic_cancer1    10.623099
# 2       ethnicgroup8     1.021901
# 3       ethnicgroup9     1.473842   <--
  • looks like adding “cancer” variable back brought “metastatic_cancer” to HR=8.58 (original HR=8.32)
    • but ethnicgroup HR’s did not change
  • if bring “arrhythmias” or “pneumonia” back, “ethnicgroup9” HR goes to 1.47 (original HR=1.47)
    • but now “metastatic_cancer” HR=10.62 again
  • so how to deal with this when multiple variable’s HR changes after removing non-significant variables??

  • seems like in final model, adding back “cancer”, “pneumonia”, “arrhythmias” will get both “metastatic_cancer” and “ethnicgroup” HR’s back to original

final model

final_vars <- c(vars_sig, "cancer", "pneumonia", "arrhythmias"); final_vars
##  [1] "los"               "age"               "gender"           
##  [4] "dementia"          "ihd"               "valvular_disease" 
##  [7] "metastatic_cancer" "prior_dnas"        "quintile"         
## [10] "ethnicgroup"       "cancer"            "pneumonia"        
## [13] "arrhythmias"
fmla <- as.formula(paste("Surv(fu_time, death) ~ ", paste(final_vars, collapse=" + "), sep = ""))
cox2_final_vars <- coxph(fmla, data = g2)
summary(cox2_final_vars)
## Call:
## coxph(formula = fmla, data = g2)
## 
##   n= 990, number of events= 489 
##    (10 observations deleted due to missingness)
## 
##                         coef exp(coef)  se(coef)      z Pr(>|z|)    
## los                 0.012907  1.012990  0.003246  3.977 6.98e-05 ***
## age                 0.061641  1.063580  0.005712 10.791  < 2e-16 ***
## gender2            -0.307789  0.735071  0.098645 -3.120  0.00181 ** 
## dementia1           0.578317  1.783036  0.187069  3.091  0.00199 ** 
## ihd1                0.182621  1.200359  0.095916  1.904  0.05692 .  
## valvular_disease1   0.219619  1.245602  0.108938  2.016  0.04380 *  
## metastatic_cancer1  2.191865  8.951892  0.405582  5.404 6.51e-08 ***
## prior_dnas          0.123649  1.131619  0.040357  3.064  0.00219 ** 
## quintile2          -0.349069  0.705345  0.157885 -2.211  0.02704 *  
## quintile3           0.122582  1.130412  0.153731  0.797  0.42523    
## quintile4          -0.004269  0.995741  0.152280 -0.028  0.97764    
## quintile5           0.146682  1.157986  0.155145  0.945  0.34443    
## ethnicgroup2       -0.309944  0.733488  0.363972 -0.852  0.39446    
## ethnicgroup3       -0.838791  0.432233  0.417241 -2.010  0.04440 *  
## ethnicgroup9        0.361710  1.435782  0.365495  0.990  0.32235    
## ethnicgroup8       -0.002431  0.997572  0.231320 -0.011  0.99161    
## cancer1             0.251735  1.286256  0.208534  1.207  0.22737    
## pneumonia1          0.295212  1.343411  0.140933  2.095  0.03620 *  
## arrhythmias1       -0.178878  0.836208  0.093920 -1.905  0.05683 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                    exp(coef) exp(-coef) lower .95 upper .95
## los                   1.0130     0.9872    1.0066    1.0195
## age                   1.0636     0.9402    1.0517    1.0756
## gender2               0.7351     1.3604    0.6058    0.8919
## dementia1             1.7830     0.5608    1.2357    2.5727
## ihd1                  1.2004     0.8331    0.9946    1.4486
## valvular_disease1     1.2456     0.8028    1.0061    1.5421
## metastatic_cancer1    8.9519     0.1117    4.0428   19.8220
## prior_dnas            1.1316     0.8837    1.0456    1.2248
## quintile2             0.7053     1.4177    0.5176    0.9612
## quintile3             1.1304     0.8846    0.8363    1.5279
## quintile4             0.9957     1.0043    0.7388    1.3420
## quintile5             1.1580     0.8636    0.8544    1.5695
## ethnicgroup2          0.7335     1.3633    0.3594    1.4969
## ethnicgroup3          0.4322     2.3136    0.1908    0.9792
## ethnicgroup9          1.4358     0.6965    0.7014    2.9390
## ethnicgroup8          0.9976     1.0024    0.6339    1.5698
## cancer1               1.2863     0.7775    0.8547    1.9357
## pneumonia1            1.3434     0.7444    1.0192    1.7708
## arrhythmias1          0.8362     1.1959    0.6956    1.0052
## 
## Concordance= 0.715  (se = 0.012 )
## Likelihood ratio test= 243.8  on 19 df,   p=<2e-16
## Wald test            = 231  on 19 df,   p=<2e-16
## Score (logrank) test = 251.9  on 19 df,   p=<2e-16

testing assumptions

1-PH assumption

  • Schoenfeld residuals
p-value
zph <- cox.zph(cox2_final_vars, transform = "km", global = TRUE) 
# defaults to "km" and global = TRUE
print(zph)
##                     chisq df      p
## los                0.0812  1 0.7757
## age                0.3726  1 0.5416
## gender             0.4983  1 0.4803
## dementia           3.8657  1 0.0493
## ihd                9.1344  1 0.0025
## valvular_disease   0.3868  1 0.5340
## metastatic_cancer  0.3979  1 0.5282
## prior_dnas         0.1459  1 0.7024
## quintile           2.1905  4 0.7008
## ethnicgroup        0.3046  4 0.9895
## cancer             3.4334  1 0.0639
## pneumonia          1.8834  1 0.1699
## arrhythmias        0.4609  1 0.4972
## GLOBAL            23.2724 19 0.2256
plot(zph)

library(survminer)
ggcoxzph(zph, var = final_vars,
         font.main = 10, 
         font.x = c(10, "plain", "black"), 
         font.y = c(10, "plain", "black"), 
         font.tickslab = c(8, "plain", "black"))

  • ihd had p-value of only 0.0032, should this var be removed?
    • or should we rely on GLOBAL p-value?

  • how come for some vars, there’s more positive residuals than negatives?
    • e.g. los, dementia, valvular_disease, metastatic_cancer, prior_dnas, cancer
  • only quintile and ethnicgroup had the opposite problem - more negative residuals than positive…
graphically with KM plot
  • only works well for binary vars
km_fit <- survfit(Surv(fu_time, death) ~ gender, data = g2)
ggsurvplot(km_fit, data = g2)

2-outliers

influence of obs on HR (coefficients)
ggcoxdiagnostics(cox2_final_vars, type = "dfbeta",
                 linear.predictions = FALSE,
                 ggtheme = theme_bw())

distribution of deviance residuals
deviance_res <- residuals(cox2_final_vars, type = "deviance")
deviance_res_outside1.96SD <- deviance_res[abs(deviance_res) > 1.96]
deviance_res_outside1.96SD %>% length / deviance_res %>% length
## [1] 0.06262626
# [1] 0.06161616
# not bad, should be < 0.05
ggcoxdiagnostics(cox2_final_vars, type = "deviance",
                 linear.predictions = FALSE,
                 ggtheme = theme_bw())

3-linear relation, continuous var <-> outcome

  • Martingale residuals
# g2 %>% select(final_vars) %>% str()
ggcoxfunctional(Surv(fu_time, death) ~ age + log(age) + sqrt(age), data = g2)
## Warning: arguments formula is deprecated; will be removed in the next version;
## please use fit instead.

ggcoxfunctional(Surv(fu_time, death) ~ los + sqrt(los), data = g2) # can't log(los), it has some zeros, log(0) = -Inf
## Warning: arguments formula is deprecated; will be removed in the next version;
## please use fit instead.

ggcoxfunctional(Surv(fu_time, death) ~ prior_dnas + sqrt(prior_dnas), data = g2) # can't log(prior_dnas) neither, same problem
## Warning: arguments formula is deprecated; will be removed in the next version;
## please use fit instead.

  • why log() and sqrt() ???
    • to see if after log() or sqrt() the relationship is more linear
  • what if you can’t log() cuz there’s 0’s???
  • what counts as linear relationship from these graphs…?

issues encountered

  • choosing vars to include from the paper
  • backward elimination
    • what if multiple significant var HR changed > 0.05? and adding back non-significant var to restore HR restores either of significant var but not all significant vars???
      • A: actually in final model if added back important non-significant vars, will correct their respective significant var’s HRs
    • but why does adding back non-significant vars affect significant var’s HR in the first place???
      • how are these variables “correlated”???
      • I thought we don’t want correlated variables in the same model???
  • choosing final set of predictors
  • testing proportionality assumptions
    • ihd had p-value of only 0.0032, should this var be removed? or should we just rely on GLOBAL p-value?
    • how to interpret graph from Martingale residuals? what counts as linear relationship???
  • why final set of predictors differ from paper
    • {logistic vs survival analysis; different dataset}

EOF