week1_4_survfit_output_by

calculate survival estimates by hand (try t=0 to t=7)

library(survival)

# read in data
g <- read.csv("GMPH_1K_final_2.csv", header = TRUE, sep = ",")
g %>% dim()

# set variable types
g[,"fu_time"] %>% typeof()
fu_time <- g[,"fu_time"]; fu_time %>% head(20)
g[,"death"] %>% typeof()
death <- g[,"death"]; death %>% head(20)

## [1] 1000   31
## [1] "integer"
##  [1]  416  648  466  441  371   47  656   12  530  551 1059   90  538  289  334
## [16]    5   79  363  195   27
## [1] "integer"
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0

survfit() conf.type options

similar results if large samples

km_fit <- survfit(Surv(fu_time, death) ~ 1) # conf.type = "log" default
summary(km_fit, times = c(1:7))

## Call: survfit(formula = Surv(fu_time, death) ~ 1)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1    992      12    0.988 0.00346        0.981        0.995
##     2    973       7    0.981 0.00435        0.972        0.989
##     3    963       5    0.976 0.00489        0.966        0.985
##     4    954       6    0.970 0.00546        0.959        0.980
##     5    945       5    0.964 0.00590        0.953        0.976
##     6    938       1    0.963 0.00598        0.952        0.975
##     7    933       1    0.962 0.00606        0.951        0.974

km_fit_plain <- survfit(Surv(fu_time, death) ~ 1, conf.type = "plain")
summary(km_fit_plain, times = c(1:7))

## Call: survfit(formula = Surv(fu_time, death) ~ 1, conf.type = "plain")
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1    992      12    0.988 0.00346        0.981        0.995
##     2    973       7    0.981 0.00435        0.972        0.989
##     3    963       5    0.976 0.00489        0.966        0.985
##     4    954       6    0.970 0.00546        0.959        0.980
##     5    945       5    0.964 0.00590        0.953        0.976
##     6    938       1    0.963 0.00598        0.952        0.975
##     7    933       1    0.962 0.00606        0.951        0.974

km_fit_loglog <- survfit(Surv(fu_time, death) ~ 1, conf.type = "log-log")
summary(km_fit_loglog, times = c(1:7))

## Call: survfit(formula = Surv(fu_time, death) ~ 1, conf.type = "log-log")
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1    992      12    0.988 0.00346        0.979        0.993
##     2    973       7    0.981 0.00435        0.970        0.988
##     3    963       5    0.976 0.00489        0.964        0.984
##     4    954       6    0.970 0.00546        0.957        0.979
##     5    945       5    0.964 0.00590        0.951        0.974
##     6    938       1    0.963 0.00598        0.950        0.974
##     7    933       1    0.962 0.00606        0.949        0.973

calculate columns of summary.survfit() by hand, Greenwood’s plain method

g %>% filter(death == 1) %>% select(fu_time) %>% table() %>% head(8)

## .
## 0 1 2 3 4 5 6 7 
## 3 9 7 5 6 5 1 1

seems at t=0, 3 people died already
but Surv() lumped t=0 and t=1 deaths together (3+9=12)

# get KM surv estimate vectors for the first few t's
len = 8

t_vec = seq(0,len-1); t_vec

# deaths (death == 1)
events = g %>% filter(death == 1) %>% select(fu_time) %>% table() %>% head(len) %>% as.numeric(); events

# censored subjects (death == 0)
censored = g %>% filter(death == 0) %>% select(fu_time) %>% table() %>% head(len) %>% as.numeric(); censored

# n.risk = people still alive (didn't die or lost-to-follow-up)
# n.risk[t+1] = n.risk[t] - (death[t] + censored[t]),
# e.g. n.risk[t=2] = 1000 - (3 + 5) = 992
n.risk <- integer(length = len)
n.risk[1] <- 1000
for (t in 2:length(n.risk)){
  n.risk[t] <-  n.risk[t-1] - (events[t-1]+censored[t-1])
}; 
# n.risk

# proP_surv = proportion survived PAST time t
# proP_surv[t] = (n.risk[t] - death[t])/n.risk[t]
proP_surv <- numeric(length = len)
proP_surv[1] <- 1

# just to make the numbers match, somehow there were deaths + censored at t=0
# merged deaths/censored @ t=0 and t=1
n.event = c(0, (events[1]+events[2]), events[3:len])
n.cens = c(0, (censored[1]+censored[2]), censored[3:len])

for (t in 1:length(proP_surv)){
  proP_surv[t] <- (n.risk[t] - n.event[t])/n.risk[t]
};
# proP_surv

# S_t = probabilty of surviving PAST time t (aka cumulative survival probability)
# S_t[t] = S_t[t-1]*proP_surv[t]
S_t <- numeric(length = len)
S_t[1] <- 1
for (t in 2:length(S_t)){
  S_t[t] = S_t[t-1]*proP_surv[t]
};
# S_t

# estimate SE of survival probability estimate
# http://sphweb.bumc.bu.edu/otlt/MPH-Modules/BS/BS704_Survival/BS704_Survival4.html
# https://www.lexjansen.com/pharmasug/2014/SP/PharmaSUG-2014-SP10.pdf
# SE at specific time, thus "point estimates"
# population method is the "Greenwoods" formula
# SE(S_t) = S_t*sqrt(sum(D_t/N_t*(N_t - D_t)))
# N_t = n.risk
# D_t = n.event
# S_t = survival probability

e_sum = numeric(length = len); e_sum
e_sum[1] <- 0
for (t in 2:length(e_sum)){
  e_sum[t] = e_sum[t-1] + n.event[t]/(n.risk[t]*(n.risk[t]-n.event[t]))
}; 
e_sum

# SE = S_t[t]*sqrt(e_sum[t])
SE <- numeric(length = len); SE
SE[1] <- NA
for (t in 2:length(SE)){
  SE[t] <- S_t[t]*sqrt(e_sum[t])
};
# SE just a litte bit off from result of survfit()...
# SE

# 95% LCI
LCI <- numeric(length = len); LCI
LCI[1] <- NA
for (t in 2:length(LCI)){
  LCI[t] <- S_t[t] - 1.96*SE[t]
};
# LCI

# 95% UCI
UCI <- numeric(length = len); UCI
UCI[1] <- NA
for (t in 2:length(UCI)){
  UCI[t] <- S_t[t] + 1.96*SE[t]
};
# UCI

km_table <- data.frame(t_vec, n.risk, events, censored, n.cens, n.event,
                       proP_surv, S_t, e_sum, SE, LCI, UCI); 
km_table_rounded <- 
  km_table %>% 
  mutate_at(vars(e_sum, SE), funs(round(., 5))) %>% 
  mutate_at(vars(-c(e_sum, SE)), funs(round(., 3)))

t_vec	n.risk	events	censored	n.cens	n.event	proP_surv	S_t	e_sum	SE	LCI	UCI
0	1000	3	5	0	0	1.000	1.000	0e+00	NA	NA	NA
1	992	9	10	15	12	0.988	0.988	1e-05	0.00347	0.981	0.995
2	973	7	3	3	7	0.993	0.981	2e-05	0.00436	0.972	0.989
3	963	5	4	4	5	0.995	0.976	3e-05	0.00490	0.966	0.985
4	954	6	3	3	6	0.994	0.970	3e-05	0.00547	0.959	0.980
5	945	5	2	2	5	0.995	0.964	4e-05	0.00590	0.953	0.976
6	938	1	4	4	1	0.999	0.963	4e-05	0.00599	0.952	0.975
7	933	1	3	3	1	0.999	0.962	4e-05	0.00607	0.950	0.974

week1_4_survfit_output_by_hand

Hippo

2020_05_25_13

calculate survival estimates by hand (try t=0 to t=7)

survfit() conf.type options

calculate columns of summary.survfit() by hand, Greenwood’s plain method

EOF