calculate survival estimates by hand (try t=0 to t=7)

library(survival)

# read in data
g <- read.csv("GMPH_1K_final_2.csv", header = TRUE, sep = ",")
g %>% dim()

# set variable types
g[,"fu_time"] %>% typeof()
fu_time <- g[,"fu_time"]; fu_time %>% head(20)
g[,"death"] %>% typeof()
death <- g[,"death"]; death %>% head(20)
## [1] 1000   31
## [1] "integer"
##  [1]  416  648  466  441  371   47  656   12  530  551 1059   90  538  289  334
## [16]    5   79  363  195   27
## [1] "integer"
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0

survfit() conf.type options

  • similar results if large samples
km_fit <- survfit(Surv(fu_time, death) ~ 1) # conf.type = "log" default
summary(km_fit, times = c(1:7))
## Call: survfit(formula = Surv(fu_time, death) ~ 1)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1    992      12    0.988 0.00346        0.981        0.995
##     2    973       7    0.981 0.00435        0.972        0.989
##     3    963       5    0.976 0.00489        0.966        0.985
##     4    954       6    0.970 0.00546        0.959        0.980
##     5    945       5    0.964 0.00590        0.953        0.976
##     6    938       1    0.963 0.00598        0.952        0.975
##     7    933       1    0.962 0.00606        0.951        0.974
km_fit_plain <- survfit(Surv(fu_time, death) ~ 1, conf.type = "plain")
summary(km_fit_plain, times = c(1:7))
## Call: survfit(formula = Surv(fu_time, death) ~ 1, conf.type = "plain")
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1    992      12    0.988 0.00346        0.981        0.995
##     2    973       7    0.981 0.00435        0.972        0.989
##     3    963       5    0.976 0.00489        0.966        0.985
##     4    954       6    0.970 0.00546        0.959        0.980
##     5    945       5    0.964 0.00590        0.953        0.976
##     6    938       1    0.963 0.00598        0.952        0.975
##     7    933       1    0.962 0.00606        0.951        0.974
km_fit_loglog <- survfit(Surv(fu_time, death) ~ 1, conf.type = "log-log")
summary(km_fit_loglog, times = c(1:7))
## Call: survfit(formula = Surv(fu_time, death) ~ 1, conf.type = "log-log")
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1    992      12    0.988 0.00346        0.979        0.993
##     2    973       7    0.981 0.00435        0.970        0.988
##     3    963       5    0.976 0.00489        0.964        0.984
##     4    954       6    0.970 0.00546        0.957        0.979
##     5    945       5    0.964 0.00590        0.951        0.974
##     6    938       1    0.963 0.00598        0.950        0.974
##     7    933       1    0.962 0.00606        0.949        0.973

calculate columns of summary.survfit() by hand, Greenwood’s plain method

g %>% filter(death == 1) %>% select(fu_time) %>% table() %>% head(8)
## .
## 0 1 2 3 4 5 6 7 
## 3 9 7 5 6 5 1 1
  • seems at t=0, 3 people died already
  • but Surv() lumped t=0 and t=1 deaths together (3+9=12)
# get KM surv estimate vectors for the first few t's
len = 8

t_vec = seq(0,len-1); t_vec

# deaths (death == 1)
events = g %>% filter(death == 1) %>% select(fu_time) %>% table() %>% head(len) %>% as.numeric(); events

# censored subjects (death == 0)
censored = g %>% filter(death == 0) %>% select(fu_time) %>% table() %>% head(len) %>% as.numeric(); censored

# n.risk = people still alive (didn't die or lost-to-follow-up)
# n.risk[t+1] = n.risk[t] - (death[t] + censored[t]),
# e.g. n.risk[t=2] = 1000 - (3 + 5) = 992
n.risk <- integer(length = len)
n.risk[1] <- 1000
for (t in 2:length(n.risk)){
  n.risk[t] <-  n.risk[t-1] - (events[t-1]+censored[t-1])
}; 
# n.risk

# proP_surv = proportion survived PAST time t
# proP_surv[t] = (n.risk[t] - death[t])/n.risk[t]
proP_surv <- numeric(length = len)
proP_surv[1] <- 1

# just to make the numbers match, somehow there were deaths + censored at t=0
# merged deaths/censored @ t=0 and t=1
n.event = c(0, (events[1]+events[2]), events[3:len])
n.cens = c(0, (censored[1]+censored[2]), censored[3:len])

for (t in 1:length(proP_surv)){
  proP_surv[t] <- (n.risk[t] - n.event[t])/n.risk[t]
};
# proP_surv

# S_t = probabilty of surviving PAST time t (aka cumulative survival probability)
# S_t[t] = S_t[t-1]*proP_surv[t]
S_t <- numeric(length = len)
S_t[1] <- 1
for (t in 2:length(S_t)){
  S_t[t] = S_t[t-1]*proP_surv[t]
};
# S_t

# estimate SE of survival probability estimate
# http://sphweb.bumc.bu.edu/otlt/MPH-Modules/BS/BS704_Survival/BS704_Survival4.html
# https://www.lexjansen.com/pharmasug/2014/SP/PharmaSUG-2014-SP10.pdf
# SE at specific time, thus "point estimates"
# population method is the "Greenwoods" formula
# SE(S_t) = S_t*sqrt(sum(D_t/N_t*(N_t - D_t)))
# N_t = n.risk
# D_t = n.event
# S_t = survival probability

e_sum = numeric(length = len); e_sum
e_sum[1] <- 0
for (t in 2:length(e_sum)){
  e_sum[t] = e_sum[t-1] + n.event[t]/(n.risk[t]*(n.risk[t]-n.event[t]))
}; 
e_sum

# SE = S_t[t]*sqrt(e_sum[t])
SE <- numeric(length = len); SE
SE[1] <- NA
for (t in 2:length(SE)){
  SE[t] <- S_t[t]*sqrt(e_sum[t])
};
# SE just a litte bit off from result of survfit()...
# SE

# 95% LCI
LCI <- numeric(length = len); LCI
LCI[1] <- NA
for (t in 2:length(LCI)){
  LCI[t] <- S_t[t] - 1.96*SE[t]
};
# LCI

# 95% UCI
UCI <- numeric(length = len); UCI
UCI[1] <- NA
for (t in 2:length(UCI)){
  UCI[t] <- S_t[t] + 1.96*SE[t]
};
# UCI

km_table <- data.frame(t_vec, n.risk, events, censored, n.cens, n.event,
                       proP_surv, S_t, e_sum, SE, LCI, UCI); 
km_table_rounded <- 
  km_table %>% 
  mutate_at(vars(e_sum, SE), funs(round(., 5))) %>% 
  mutate_at(vars(-c(e_sum, SE)), funs(round(., 3)))
t_vec n.risk events censored n.cens n.event proP_surv S_t e_sum SE LCI UCI
0 1000 3 5 0 0 1.000 1.000 0e+00 NA NA NA
1 992 9 10 15 12 0.988 0.988 1e-05 0.00347 0.981 0.995
2 973 7 3 3 7 0.993 0.981 2e-05 0.00436 0.972 0.989
3 963 5 4 4 5 0.995 0.976 3e-05 0.00490 0.966 0.985
4 954 6 3 3 6 0.994 0.970 3e-05 0.00547 0.959 0.980
5 945 5 2 2 5 0.995 0.964 4e-05 0.00590 0.953 0.976
6 938 1 4 4 1 0.999 0.963 4e-05 0.00599 0.952 0.975
7 933 1 3 3 1 0.999 0.962 4e-05 0.00607 0.950 0.974

EOF