get data

library(survival)
library(car)
g <- read.csv("GMPH_1K_final_2.csv", header = TRUE, sep = ",")
g %>% dim()
## [1] 1000   31
# g %>% head()
g$gender <- as.factor(g$gender)
g$copd <- as.factor(g$copd)
g$ethnicgroup <- as.factor(g$ethnicgroup)
levels(g$ethnicgroup) <- c(levels(g$ethnicgroup), "8")
g$ethnicgroup[is.na(g$ethnicgroup)] <- "8"
table(g$ethnicgroup, exclude = NULL)
## 
##   1   2   3   9   8 
## 889  17  34  17  43
# set quintile 0 to NAs, coxph will ignore these patients
g <- g %>% mutate(quintile = recode(quintile, "c('0')=NA"))
table(g$quintile, exclude = NULL)
## 
##    1    2    3    4    5 <NA> 
##  138  205  211  220  216   10
g$quintile <- as.factor(g$quintile)

test PH assumption

  • e.g. test if hazard is proportional for a given predictor (e.g. gender)
  • test graphically
    • when you plot the hazards for male vs female,
    • lines should be approximately parallel
    • at any time point, you can multiply hazard of male, get hazard of female
    • then the relationship can be summarized with 1 number, the HR
    • hazard functions should definitely NOT cross
  • test with p-value
    • using Schoenfeld residuals
    • high p-value is good, means residuals not correlated with time of event, proportional assumption valid
  • should do both graphically and with p-value
cox <- coxph(Surv(fu_time, death) ~ age + gender + copd + quintile + ethnicgroup, data = g)

zph <- cox.zph(cox, transform = "km", global = TRUE) # defaults to "km" and global = TRUE
print(zph)
##             chisq df    p
## age         0.151  1 0.70
## gender      0.743  1 0.39
## copd        0.143  1 0.71
## quintile    2.310  4 0.68
## ethnicgroup 0.809  4 0.94
## GLOBAL      4.432 11 0.96
plot(zph)

# run on gender only
cox_gender <- coxph(Surv(fu_time, death) ~ gender, data = g)

zph_gender <- cox.zph(cox_gender, transform = "km", global = TRUE) # defaults to "km" and global = TRUE
print(zph_gender)
##        chisq df    p
## gender  1.24  1 0.26
## GLOBAL  1.24  1 0.26
plot(zph_gender)

# residual is different if run gender only, vs multivariate
  • pvalue tests null model = if slope for regression line of residuals is 0
    • i.e. if slope = 0, accept null model, p-value > 0.05
  • pvalue only testing for linear relationship btw residuals and time,
    • can miss non-linear relationships
    • should check plots too
  • plots show scaled Schoenfeld residuals with time

  • more later on what to do if PH assumption NOT met

survminer ggcoxzph()

  • time is transformed…
library(survminer)
ggcoxzph(zph_gender)

KM plot

  • to visually check hazard functions
  • only works well for binary variables (e.g. gender)
km_fit <- survfit(Surv(fu_time, death) ~ gender, data = g)
plot(km_fit, xlab = "time", ylab = "survival probability")

deviance residuals

  • normalized transformation of Martingale residuals
  • mean 0, sd 1
  • helps to check for outliers/ influential data points (e.g. patient)
  • either
    • examine influence of each data point on coefficient, or
    • plot distribution of residuals against covariate

influence of each obs on regression coefficient

  • using survminer’s ggcoxdiagnostics(… type = “dfbeta”)
  • dfbeta plots estimated changes in regression coefficient on deleting each observation (patient) in turn
cox <- coxph(Surv(fu_time, death) ~ age, data = g)
ggcoxdiagnostics(cox, type = "dfbeta",
                 linear.predictions = FALSE,
                 ggtheme = theme_bw())

distribution of residuals against covariate

  • residuals SHOULD BE normally distributed (mean 0, SD 1)
    • then only 5% of observations should be 1.96 SD from mean
    • if > 5% observations 1.96 SD from mean,
      • then model doesn’t fit data as well, and
      • some observations are outliers
    • papers report “proportion of standardized residuals outside -1.96 to 1.96 range = e.g. 0.6%”
      • {shouldn’t this be for each covariate? or for all covariates…
      • nevermind, if multivariate then one residual per observation for all variables}
cox <- coxph(Surv(fu_time, death) ~ age, data = g)
ggcoxdiagnostics(cox, type = "deviance",
                 linear.predictions = FALSE,
                 ggtheme = theme_bw())

  • residuals = E - O
  • positive values - individuals expected/ predicted survival times > observed survival times
  • negative values - individuals expected/ predicted survival times < observed survival times
  • observations with very small or large residuals are outliers, poorly predicted by the model

Martingale residuals, linear relation

  • tests whether continuous variables have linear relation with outcome
  • “if you fit age as a single term in the model…” {what if other terms were included in the model?}
ggcoxfunctional(Surv(fu_time, death) ~ age + log(age) + sqrt(age), data = g)

# oh okay, age + log(age) + sqrt(age) just plots 3 separate graphs...
  • Martingale residuals range from 1 to -inf
    • MR near 1 = patients Exp death > Obs death
    • MR large neg = patients Exp death < Obs death

practice: test PH assumption for COPD

cox_copd <- coxph(Surv(fu_time, death) ~ copd, data = g)
zph_copd <- cox.zph(cox_copd, transform = "km", global = TRUE) # defaults to "km" and global = TRUE
print(zph_copd)
##        chisq df    p
## copd    1.24  1 0.27
## GLOBAL  1.24  1 0.27

EOF