get data

library(survival)
library(survminer)
library(car)
g <- read.csv("GMPH_1K_final_2.csv", header = TRUE, sep = ",")
g %>% dim()

predictors from paper

  • had: age, gender, ethnicgroup, ihd, stroke, valvular_disease, pvd, copd, pneumonia, hypertension, cancer, metastatic_cancer, senile, dementia, mental_health, prior_dnas, los, cabg
  • should have also had: renal_disease, cancer, combined senility and dementia
  • shouldn’t have had: cabg

  • cox model output, should check SE, should be < 1, even better if < 0.1

prep predictors

# combine senile and dementia into cog_imp, if either senile or dementia = 1, cog_imp = 1
g$cog_imp <- ifelse(g$senile == 1 | g$dementia == 1, 1, 0)
# g$cog_imp %>% table()

# get new df for paper predictors only
g_paper <- g %>% select(c(age, gender, ethnicgroup, ihd, valvular_disease, pvd, stroke, copd, pneumonia, hypertension, renal_disease, cancer, metastatic_cancer, mental_health, cog_imp, los, prior_dnas))

# convert binaries variables to factors
# g_paper %>% str()
g_paper <- g_paper %>% mutate_if(~ n_distinct(.) == 2, as.factor)

# deal with ethnicity NAs
g_paper$ethnicgroup <- as.factor(g_paper$ethnicgroup)
levels(g_paper$ethnicgroup) <- c(levels(g_paper$ethnicgroup), "8")
g_paper$ethnicgroup[is.na(g_paper$ethnicgroup)] <- "8"
# table(g_paper$ethnicgroup, exclude = NULL)

# attach the outcomes
g_paper$fu_time <- g$fu_time
g_paper$death <- g$death

g_paper %>% str()

continous variables

  • check if relationship with outcome is linear
  • plots variable (x) vs Martingale residuals (y) { why not plot variable vs fu_time ??? }
  • do this before cox modeling
ggcoxfunctional(Surv(fu_time, death) ~ age + log(age) + sqrt(age), data = g_paper)

ggcoxfunctional(Surv(fu_time, death) ~ los + sqrt(los), data = g_paper) 

# can't log(los), it has some zeros, log(0) = -Inf

ggcoxfunctional(Surv(fu_time, death) ~ prior_dnas + sqrt(prior_dnas), data = g_paper)

# can't log(prior_dnas) neither, same problem
# prior_dnas has a lot of zeros, values are not normally distributed { does it matter??? }

cox model

cox_paper <- coxph(Surv(fu_time, death) ~ ., data = g_paper)
# summary(cox_paper)

HR_allVars <- summary(cox_paper)$coefficients %>%
  as.data.frame() %>% 
  rownames_to_column("vars") %>%
  filter(`Pr(>|z|)` < 0.05) %>%
  select(vars, "HR_allVars" = `exp(coef)`)
HR_allVars
##                 vars HR_allVars
## 1                age  1.0600833
## 2            gender2  0.8057446
## 3         pneumonia1  1.3528890
## 4 metastatic_cancer1  8.9778235
## 5           cog_imp1  1.3873881
## 6                los  1.0116204
## 7         prior_dnas  1.1134596
  • SE are all low enough, would be better if < 0.1 {with larger sample size}

backward elimination

drop variables with p-value > 0.05

# get names of only significant variables
sigVars <- summary(cox_paper)$coefficients %>%
  as.data.frame() %>% 
  rownames_to_column("vars") %>%
  filter(`Pr(>|z|)` < 0.05) %>%
  select(vars) %>%
  unlist() %>% unname()
sigVars
## [1] "age"                "gender2"            "pneumonia1"        
## [4] "metastatic_cancer1" "cog_imp1"           "los"               
## [7] "prior_dnas"

new cox model with significant variables only

  • somehow course had age, gender, valvular, pneumonia, mets and cog_imp as significant
    • clearly wrong since output from first cox model is the same…
g_paper %>% str()
## 'data.frame':    1000 obs. of  19 variables:
##  $ age              : int  90 74 83 79 94 89 63 86 72 82 ...
##  $ gender           : Factor w/ 2 levels "1","2": 2 1 2 1 2 1 1 2 2 2 ...
##  $ ethnicgroup      : Factor w/ 5 levels "1","2","3","9",..: 5 1 1 1 1 5 1 1 1 1 ...
##  $ ihd              : Factor w/ 2 levels "0","1": 1 2 1 2 1 1 1 1 1 1 ...
##  $ valvular_disease : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 1 1 1 1 ...
##  $ pvd              : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
##  $ stroke           : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 2 1 ...
##  $ copd             : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 2 1 ...
##  $ pneumonia        : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 1 1 1 ...
##  $ hypertension     : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 2 2 1 1 ...
##  $ renal_disease    : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
##  $ cancer           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ metastatic_cancer: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mental_health    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ cog_imp          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ los              : int  2 10 3 1 17 47 3 12 2 2 ...
##  $ prior_dnas       : int  0 1 0 2 0 0 0 1 0 1 ...
##  $ fu_time          : int  416 648 466 441 371 47 656 12 530 551 ...
##  $ death            : int  0 0 0 0 0 0 0 0 0 0 ...
cox_sigVars <- coxph(Surv(fu_time, death) ~ age + gender + valvular_disease + pneumonia + metastatic_cancer + cog_imp, data = g_paper)
HR_sigVars <- summary(cox_sigVars)$coefficients %>%
  as.data.frame() %>%
  rownames_to_column("vars") %>%
  select(vars, "HR_sigVars" = `exp(coef)`)
HR_sigVars
##                 vars HR_sigVars
## 1                age   1.059630
## 2            gender2   0.755440
## 3  valvular_disease1   1.271553
## 4         pneumonia1   1.565284
## 5 metastatic_cancer1  12.208870
## 6           cog_imp1   1.430736

compare HR of original cox model and model with only significant vars

inner_join(HR_allVars, HR_sigVars, by = "vars")
##                 vars HR_allVars HR_sigVars
## 1                age  1.0600833   1.059630
## 2            gender2  0.8057446   0.755440
## 3         pneumonia1  1.3528890   1.565284
## 4 metastatic_cancer1  8.9778235  12.208870
## 5           cog_imp1  1.3873881   1.430736
  • only metastatic_cancer HR had dramatic change (8.98 to 12.20)
    • should we add back variables to try to get metastatic_cancer HR back to 8.98?
    • depends on how results is going to be used,
    • if you only care about finding significant predictors, then HR doesn’t really matter
    • { then course notes goes on to look at cog_imp while talking about metastatic_cancer… }
  • { skipped assumptions testing, no new info }

why final model differ from report

  1. data similar, not the same
    • data here is simulated from same distribution as original, not same data
  2. some variables omitted
  3. logistic regression vs survival analysis (different outcomes)
    • logistic regression used mortality within 1 year of admission as outcome
    • survival analysis follow-up period much longer than 1 year
  4. size of dataset of original paper much bigger (80,000 vs 1000)
    • smaller dataset = less power to detect significant relations

EOF