get data
library(survival)
library(car)
g <- read.csv("GMPH_1K_final_2.csv", header = TRUE, sep = ",")
g %>% dim()
# g %>% head()
g$gender <- as.factor(g$gender)
g$copd <- as.factor(g$copd)
g$ethnicgroup <- as.factor(g$ethnicgroup)
levels(g$ethnicgroup) <- c(levels(g$ethnicgroup), "8")
g$ethnicgroup[is.na(g$ethnicgroup)] <- "8"
table(g$ethnicgroup, exclude = NULL)
# set quintile 0 to NAs, coxph will ignore these patients
g <- g %>% mutate(quintile = recode(quintile, "c('0')=NA"))
table(g$quintile, exclude = NULL)
g$quintile <- as.factor(g$quintile)
g %>% names()
option1, using apriori variables
- https://www.ncbi.nlm.nih.gov/books/NBK513479/
- Objective 1: what are the main patient, primary care and hospital factors associated with variation in readmission and mortality rates?
- Table 6: Odds ratios with 95% CIs for possible patient, trust and primary care predictors of mortality within 1 year of admission for HF and COPD patients
- paper used logistic regression with outcome = death within 1 year of follow-up, but interesting to see if significant predictors apply to Cox regression
- significant predictors = age, gender (male = ref), ethnicgroup (white = ref), ihd, stroke, valvular_disease, pvd, copd, pneumonia, hypertension, cancer, metastatic_cancer, senile, dementia, mental health, prior_dnas, los (0 nights = ref), cabg
- use age, los and prior_dnas as continuous for this exercise
# check class of each column
g %>% dim() # [1] 1000 31
g %>% str()
# get new df for selected variables
g1 <- g %>% select(c(age, gender, ethnicgroup, ihd, stroke, valvular_disease, pvd, copd, pneumonia, hypertension, cancer, metastatic_cancer, senile, dementia, mental_health, prior_dnas, los, cabg, fu_time))
g1 %>% dim() # [1] 1000 21
# convert all categorical variables to factors
g1 <- g1 %>% mutate_if(~ n_distinct(.) == 2, as.factor)
g1$ethnicgroup <- as.factor(g1$ethnicgroup)
# attach death on its own, since previous mutate_if() affects it
g1$death <- g$death
g1 %>% str()
# run cox model on apriori variables
cox <- coxph(Surv(fu_time, death) ~ ., data = g1)
summary(cox)
option2, include all available predictors
g %>% str()
g2 <- g %>% select(-c(id, death))
g2 <- g2 %>% mutate_if(~n_distinct(.) == 2, as.factor)
g2$death <- g$death
g2 %>% str()
g2$prior_dnas %>% table(., exclude = NULL)
g2$ethnicgroup %>% table(., exclude = NULL)
g2$quintile %>% table(., exclude = NULL)
# only 3 patients had crt=1, model could not converge, remove crt
g2$crt %>% table(., exclude = NULL)
g2 <- g2 %>% select(-crt)
# fit model2 with all available predictors
cox2 <- coxph(Surv(fu_time, death) ~ ., data = g2)
cox2_coef_before <- summary(cox2)$coefficients %>%
as.data.frame() %>%
rownames_to_column(., "predictors") %>%
rename("HR_before" = `exp(coef)`)
cox2_coef_before
option3, backward elimination
# get significant variables
vars_sig <- summary(cox2)$coefficients %>%
as.data.frame() %>%
rownames_to_column(., "predictors") %>%
filter(`Pr(>|z|)` < 0.05) %>%
select(predictors) %>%
unlist() %>% unname()
vars_sig <- vars_sig %>% gsub('[0-9]+', '', .)
# rerun model with only significant variables
fmla <- as.formula(paste("Surv(fu_time, death) ~ ", paste(vars_sig, collapse=" + "), sep = ""))
cox2_vars_sig <- coxph(fmla, data = g2)
cox2_coef_after <- summary(cox2_vars_sig)$coefficients %>%
as.data.frame() %>%
rownames_to_column(., "predictors") %>%
rename("HR_after" = `exp(coef)`)
cox2_coef_after
## predictors coef HR_after se(coef) z
## 1 los 0.013156517 1.0132434 0.003207110 4.102296895
## 2 age 0.062300175 1.0642818 0.005753519 10.828185358
## 3 gender2 -0.299286306 0.7413471 0.097075407 -3.083029114
## 4 dementia1 0.610253737 1.8408984 0.185367524 3.292128645
## 5 ihd1 0.198290786 1.2193169 0.095267929 2.081401243
## 6 valvular_disease1 0.212028608 1.2361832 0.108826412 1.948319392
## 7 metastatic_cancer1 2.333592101 10.3149273 0.370321509 6.301530010
## 8 prior_dnas 0.126983722 1.1353985 0.039979444 3.176225338
## 9 quintile2 -0.338291025 0.7129878 0.157005856 -2.154639545
## 10 quintile3 0.111668957 1.1181426 0.152809420 0.730772728
## 11 quintile4 0.001467675 1.0014688 0.151909946 0.009661478
## 12 quintile5 0.156418615 1.1693156 0.154448715 1.012754395
## 13 ethnicgroup2 -0.291773211 0.7469379 0.361218378 -0.807747416
## 14 ethnicgroup3 -0.819252318 0.4407611 0.416641231 -1.966325602
## 15 ethnicgroup9 0.427600879 1.5335739 0.364356797 1.173577335
## 16 ethnicgroup8 0.063274526 1.0653193 0.228335104 0.277112564
## Pr(>|z|)
## 1 4.090689e-05
## 2 2.531218e-27
## 3 2.049051e-03
## 4 9.943212e-04
## 5 3.739719e-02
## 6 5.137676e-02
## 7 2.947217e-10
## 8 1.492050e-03
## 9 3.119005e-02
## 10 4.649180e-01
## 11 9.922914e-01
## 12 3.111775e-01
## 13 4.192360e-01
## 14 4.926101e-02
## 15 2.405644e-01
## 16 7.816937e-01
# compare cox2_before vs cox2_after HRs, see which has > 0.05 difference
inner_join(cox2_coef_before, cox2_coef_after, by = "predictors") %>%
select(predictors, HR_before, HR_after) %>%
mutate(HR_diff = HR_after - HR_before) %>%
filter(abs(HR_diff) > 0.05)
## predictors HR_before HR_after HR_diff
## 1 metastatic_cancer1 8.3178427 10.314927 1.99708467
## 2 ethnicgroup9 1.4651009 1.533574 0.06847298
## 3 ethnicgroup8 0.9816789 1.065319 0.08364040
" predictors HR_before HR_after HR_diff
1 metastatic_cancer1 8.3178427 10.314927 1.99708467
2 ethnicgroup8 0.9816789 1.065319 0.08364040
3 ethnicgroup9 1.4651009 1.533574 0.06847298"
## [1] " predictors HR_before HR_after HR_diff\n1 metastatic_cancer1 8.3178427 10.314927 1.99708467\n2 ethnicgroup8 0.9816789 1.065319 0.08364040\n3 ethnicgroup9 1.4651009 1.533574 0.06847298"
vars_sig
## [1] "los" "age" "gender"
## [4] "dementia" "ihd" "valvular_disease"
## [7] "metastatic_cancer" "prior_dnas" "quintile"
## [10] "ethnicgroup"
- looks like metastatic_cancer and ethnicgroup’s HR changed > 0.05
add non-sig variables back
# get list of non-significant variables to add back
vars_sig
vars_all <- names(g2)
vars_to_addback <- setdiff(vars_all[!vars_all %in% c("fu_time", "death")], vars_sig); vars_to_addback
i = 1
for (i in 1:length(vars_to_addback)){
bkwd_elim_vars <- c(vars_sig, vars_to_addback[i]); bkwd_elim_vars
fmla <- as.formula(paste("Surv(fu_time, death) ~ ", paste(bkwd_elim_vars, collapse=" + "), sep = ""))
cox2_bkwd_elim_vars <- coxph(fmla, data = g2)
bkwd_elim_HRs <- summary(cox2_bkwd_elim_vars)$coefficients %>%
as.data.frame() %>%
rownames_to_column(., "predictors") %>%
rename("HR_bkwd_elim" = `exp(coef)`) %>%
select(predictors, HR_bkwd_elim) %>%
filter(predictors %in% c("metastatic_cancer1", "ethnicgroup8", "ethnicgroup9"))
print(paste0("added <", vars_to_addback[i], "> back:"))
print(paste0("bkwd_elim_HRs are:"))
print(bkwd_elim_HRs)
print("")
}
# [1] "added <cancer> back:"
# [1] "bkwd_elim_HRs are:"
# predictors HR_bkwd_elim
# 1 metastatic_cancer1 8.583697 <--
# 2 ethnicgroup8 1.077854
# 3 ethnicgroup9 1.551762
# [1] "added <renal_disease> back:"
# [1] "bkwd_elim_HRs are:"
# predictors HR_bkwd_elim
# 1 metastatic_cancer1 9.895061 <--
# 2 ethnicgroup8 1.050404
# 3 ethnicgroup9 1.541174
# [1] "added <arrhythmias> back:"
# [1] "bkwd_elim_HRs are:"
# predictors HR_bkwd_elim
# 1 metastatic_cancer1 10.566096
# 2 ethnicgroup8 1.027209
# 3 ethnicgroup9 1.478646 <--
# [1] "added <pneumonia> back:"
# [1] "bkwd_elim_HRs are:"
# predictors HR_bkwd_elim
# 1 metastatic_cancer1 10.623099
# 2 ethnicgroup8 1.021901
# 3 ethnicgroup9 1.473842 <--
- looks like adding “cancer” variable back brought “metastatic_cancer” to HR=8.58 (original HR=8.32)
- but ethnicgroup HR’s did not change
- if bring “arrhythmias” or “pneumonia” back, “ethnicgroup9” HR goes to 1.47 (original HR=1.47)
- but now “metastatic_cancer” HR=10.62 again
- so how to deal with this when multiple variable’s HR changes after removing non-significant variables??
- seems like in final model, adding back “cancer”, “pneumonia”, “arrhythmias” will get both “metastatic_cancer” and “ethnicgroup” HR’s back to original
final model
final_vars <- c(vars_sig, "cancer", "pneumonia", "arrhythmias"); final_vars
## [1] "los" "age" "gender"
## [4] "dementia" "ihd" "valvular_disease"
## [7] "metastatic_cancer" "prior_dnas" "quintile"
## [10] "ethnicgroup" "cancer" "pneumonia"
## [13] "arrhythmias"
fmla <- as.formula(paste("Surv(fu_time, death) ~ ", paste(final_vars, collapse=" + "), sep = ""))
cox2_final_vars <- coxph(fmla, data = g2)
summary(cox2_final_vars)
## Call:
## coxph(formula = fmla, data = g2)
##
## n= 990, number of events= 489
## (10 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## los 0.012907 1.012990 0.003246 3.977 6.98e-05 ***
## age 0.061641 1.063580 0.005712 10.791 < 2e-16 ***
## gender2 -0.307789 0.735071 0.098645 -3.120 0.00181 **
## dementia1 0.578317 1.783036 0.187069 3.091 0.00199 **
## ihd1 0.182621 1.200359 0.095916 1.904 0.05692 .
## valvular_disease1 0.219619 1.245602 0.108938 2.016 0.04380 *
## metastatic_cancer1 2.191865 8.951892 0.405582 5.404 6.51e-08 ***
## prior_dnas 0.123649 1.131619 0.040357 3.064 0.00219 **
## quintile2 -0.349069 0.705345 0.157885 -2.211 0.02704 *
## quintile3 0.122582 1.130412 0.153731 0.797 0.42523
## quintile4 -0.004269 0.995741 0.152280 -0.028 0.97764
## quintile5 0.146682 1.157986 0.155145 0.945 0.34443
## ethnicgroup2 -0.309944 0.733488 0.363972 -0.852 0.39446
## ethnicgroup3 -0.838791 0.432233 0.417241 -2.010 0.04440 *
## ethnicgroup9 0.361710 1.435782 0.365495 0.990 0.32235
## ethnicgroup8 -0.002431 0.997572 0.231320 -0.011 0.99161
## cancer1 0.251735 1.286256 0.208534 1.207 0.22737
## pneumonia1 0.295212 1.343411 0.140933 2.095 0.03620 *
## arrhythmias1 -0.178878 0.836208 0.093920 -1.905 0.05683 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## los 1.0130 0.9872 1.0066 1.0195
## age 1.0636 0.9402 1.0517 1.0756
## gender2 0.7351 1.3604 0.6058 0.8919
## dementia1 1.7830 0.5608 1.2357 2.5727
## ihd1 1.2004 0.8331 0.9946 1.4486
## valvular_disease1 1.2456 0.8028 1.0061 1.5421
## metastatic_cancer1 8.9519 0.1117 4.0428 19.8220
## prior_dnas 1.1316 0.8837 1.0456 1.2248
## quintile2 0.7053 1.4177 0.5176 0.9612
## quintile3 1.1304 0.8846 0.8363 1.5279
## quintile4 0.9957 1.0043 0.7388 1.3420
## quintile5 1.1580 0.8636 0.8544 1.5695
## ethnicgroup2 0.7335 1.3633 0.3594 1.4969
## ethnicgroup3 0.4322 2.3136 0.1908 0.9792
## ethnicgroup9 1.4358 0.6965 0.7014 2.9390
## ethnicgroup8 0.9976 1.0024 0.6339 1.5698
## cancer1 1.2863 0.7775 0.8547 1.9357
## pneumonia1 1.3434 0.7444 1.0192 1.7708
## arrhythmias1 0.8362 1.1959 0.6956 1.0052
##
## Concordance= 0.715 (se = 0.012 )
## Likelihood ratio test= 243.8 on 19 df, p=<2e-16
## Wald test = 231 on 19 df, p=<2e-16
## Score (logrank) test = 251.9 on 19 df, p=<2e-16
testing assumptions
1-PH assumption
p-value
zph <- cox.zph(cox2_final_vars, transform = "km", global = TRUE)
# defaults to "km" and global = TRUE
print(zph)
## chisq df p
## los 0.0812 1 0.7757
## age 0.3726 1 0.5416
## gender 0.4983 1 0.4803
## dementia 3.8657 1 0.0493
## ihd 9.1344 1 0.0025
## valvular_disease 0.3868 1 0.5340
## metastatic_cancer 0.3979 1 0.5282
## prior_dnas 0.1459 1 0.7024
## quintile 2.1905 4 0.7008
## ethnicgroup 0.3046 4 0.9895
## cancer 3.4334 1 0.0639
## pneumonia 1.8834 1 0.1699
## arrhythmias 0.4609 1 0.4972
## GLOBAL 23.2724 19 0.2256
plot(zph)
library(survminer)
ggcoxzph(zph, var = final_vars,
font.main = 10,
font.x = c(10, "plain", "black"),
font.y = c(10, "plain", "black"),
font.tickslab = c(8, "plain", "black"))
- ihd had p-value of only 0.0032, should this var be removed?
- or should we rely on GLOBAL p-value?
- how come for some vars, there’s more positive residuals than negatives?
- e.g. los, dementia, valvular_disease, metastatic_cancer, prior_dnas, cancer
- only quintile and ethnicgroup had the opposite problem - more negative residuals than positive…
graphically with KM plot
- only works well for binary vars
km_fit <- survfit(Surv(fu_time, death) ~ gender, data = g2)
ggsurvplot(km_fit, data = g2)
2-outliers
influence of obs on HR (coefficients)
ggcoxdiagnostics(cox2_final_vars, type = "dfbeta",
linear.predictions = FALSE,
ggtheme = theme_bw())
distribution of deviance residuals
deviance_res <- residuals(cox2_final_vars, type = "deviance")
deviance_res_outside1.96SD <- deviance_res[abs(deviance_res) > 1.96]
deviance_res_outside1.96SD %>% length / deviance_res %>% length
## [1] 0.06262626
# [1] 0.06161616
# not bad, should be < 0.05
ggcoxdiagnostics(cox2_final_vars, type = "deviance",
linear.predictions = FALSE,
ggtheme = theme_bw())
3-linear relation, continuous var <-> outcome
# g2 %>% select(final_vars) %>% str()
ggcoxfunctional(Surv(fu_time, death) ~ age + log(age) + sqrt(age), data = g2)
## Warning: arguments formula is deprecated; will be removed in the next version;
## please use fit instead.
ggcoxfunctional(Surv(fu_time, death) ~ los + sqrt(los), data = g2) # can't log(los), it has some zeros, log(0) = -Inf
## Warning: arguments formula is deprecated; will be removed in the next version;
## please use fit instead.
ggcoxfunctional(Surv(fu_time, death) ~ prior_dnas + sqrt(prior_dnas), data = g2) # can't log(prior_dnas) neither, same problem
## Warning: arguments formula is deprecated; will be removed in the next version;
## please use fit instead.
- why log() and sqrt() ???
- to see if after log() or sqrt() the relationship is more linear
- what if you can’t log() cuz there’s 0’s???
- what counts as linear relationship from these graphs…?
issues encountered
- choosing vars to include from the paper
- backward elimination
- what if multiple significant var HR changed > 0.05? and adding back non-significant var to restore HR restores either of significant var but not all significant vars???
- A: actually in final model if added back important non-significant vars, will correct their respective significant var’s HRs
- but why does adding back non-significant vars affect significant var’s HR in the first place???
- how are these variables “correlated”???
- I thought we don’t want correlated variables in the same model???
- choosing final set of predictors
- testing proportionality assumptions
- ihd had p-value of only 0.0032, should this var be removed? or should we just rely on GLOBAL p-value?
- how to interpret graph from Martingale residuals? what counts as linear relationship???
- why final set of predictors differ from paper
- {logistic vs survival analysis; different dataset}
EOF