calculate survival estimates by hand (try t=0 to t=7)
library(survival)
# read in data
g <- read.csv("GMPH_1K_final_2.csv", header = TRUE, sep = ",")
g %>% dim()
# set variable types
g[,"fu_time"] %>% typeof()
fu_time <- g[,"fu_time"]; fu_time %>% head(20)
g[,"death"] %>% typeof()
death <- g[,"death"]; death %>% head(20)
## [1] 1000 31
## [1] "integer"
## [1] 416 648 466 441 371 47 656 12 530 551 1059 90 538 289 334
## [16] 5 79 363 195 27
## [1] "integer"
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0
survfit() conf.type options
- similar results if large samples
km_fit <- survfit(Surv(fu_time, death) ~ 1) # conf.type = "log" default
summary(km_fit, times = c(1:7))
## Call: survfit(formula = Surv(fu_time, death) ~ 1)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 992 12 0.988 0.00346 0.981 0.995
## 2 973 7 0.981 0.00435 0.972 0.989
## 3 963 5 0.976 0.00489 0.966 0.985
## 4 954 6 0.970 0.00546 0.959 0.980
## 5 945 5 0.964 0.00590 0.953 0.976
## 6 938 1 0.963 0.00598 0.952 0.975
## 7 933 1 0.962 0.00606 0.951 0.974
km_fit_plain <- survfit(Surv(fu_time, death) ~ 1, conf.type = "plain")
summary(km_fit_plain, times = c(1:7))
## Call: survfit(formula = Surv(fu_time, death) ~ 1, conf.type = "plain")
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 992 12 0.988 0.00346 0.981 0.995
## 2 973 7 0.981 0.00435 0.972 0.989
## 3 963 5 0.976 0.00489 0.966 0.985
## 4 954 6 0.970 0.00546 0.959 0.980
## 5 945 5 0.964 0.00590 0.953 0.976
## 6 938 1 0.963 0.00598 0.952 0.975
## 7 933 1 0.962 0.00606 0.951 0.974
km_fit_loglog <- survfit(Surv(fu_time, death) ~ 1, conf.type = "log-log")
summary(km_fit_loglog, times = c(1:7))
## Call: survfit(formula = Surv(fu_time, death) ~ 1, conf.type = "log-log")
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 992 12 0.988 0.00346 0.979 0.993
## 2 973 7 0.981 0.00435 0.970 0.988
## 3 963 5 0.976 0.00489 0.964 0.984
## 4 954 6 0.970 0.00546 0.957 0.979
## 5 945 5 0.964 0.00590 0.951 0.974
## 6 938 1 0.963 0.00598 0.950 0.974
## 7 933 1 0.962 0.00606 0.949 0.973
calculate columns of summary.survfit() by hand, Greenwood’s plain method
g %>% filter(death == 1) %>% select(fu_time) %>% table() %>% head(8)
## .
## 0 1 2 3 4 5 6 7
## 3 9 7 5 6 5 1 1
- seems at t=0, 3 people died already
- but Surv() lumped t=0 and t=1 deaths together (3+9=12)
# get KM surv estimate vectors for the first few t's
len = 8
t_vec = seq(0,len-1); t_vec
# deaths (death == 1)
events = g %>% filter(death == 1) %>% select(fu_time) %>% table() %>% head(len) %>% as.numeric(); events
# censored subjects (death == 0)
censored = g %>% filter(death == 0) %>% select(fu_time) %>% table() %>% head(len) %>% as.numeric(); censored
# n.risk = people still alive (didn't die or lost-to-follow-up)
# n.risk[t+1] = n.risk[t] - (death[t] + censored[t]),
# e.g. n.risk[t=2] = 1000 - (3 + 5) = 992
n.risk <- integer(length = len)
n.risk[1] <- 1000
for (t in 2:length(n.risk)){
n.risk[t] <- n.risk[t-1] - (events[t-1]+censored[t-1])
};
# n.risk
# proP_surv = proportion survived PAST time t
# proP_surv[t] = (n.risk[t] - death[t])/n.risk[t]
proP_surv <- numeric(length = len)
proP_surv[1] <- 1
# just to make the numbers match, somehow there were deaths + censored at t=0
# merged deaths/censored @ t=0 and t=1
n.event = c(0, (events[1]+events[2]), events[3:len])
n.cens = c(0, (censored[1]+censored[2]), censored[3:len])
for (t in 1:length(proP_surv)){
proP_surv[t] <- (n.risk[t] - n.event[t])/n.risk[t]
};
# proP_surv
# S_t = probabilty of surviving PAST time t (aka cumulative survival probability)
# S_t[t] = S_t[t-1]*proP_surv[t]
S_t <- numeric(length = len)
S_t[1] <- 1
for (t in 2:length(S_t)){
S_t[t] = S_t[t-1]*proP_surv[t]
};
# S_t
# estimate SE of survival probability estimate
# http://sphweb.bumc.bu.edu/otlt/MPH-Modules/BS/BS704_Survival/BS704_Survival4.html
# https://www.lexjansen.com/pharmasug/2014/SP/PharmaSUG-2014-SP10.pdf
# SE at specific time, thus "point estimates"
# population method is the "Greenwoods" formula
# SE(S_t) = S_t*sqrt(sum(D_t/N_t*(N_t - D_t)))
# N_t = n.risk
# D_t = n.event
# S_t = survival probability
e_sum = numeric(length = len); e_sum
e_sum[1] <- 0
for (t in 2:length(e_sum)){
e_sum[t] = e_sum[t-1] + n.event[t]/(n.risk[t]*(n.risk[t]-n.event[t]))
};
e_sum
# SE = S_t[t]*sqrt(e_sum[t])
SE <- numeric(length = len); SE
SE[1] <- NA
for (t in 2:length(SE)){
SE[t] <- S_t[t]*sqrt(e_sum[t])
};
# SE just a litte bit off from result of survfit()...
# SE
# 95% LCI
LCI <- numeric(length = len); LCI
LCI[1] <- NA
for (t in 2:length(LCI)){
LCI[t] <- S_t[t] - 1.96*SE[t]
};
# LCI
# 95% UCI
UCI <- numeric(length = len); UCI
UCI[1] <- NA
for (t in 2:length(UCI)){
UCI[t] <- S_t[t] + 1.96*SE[t]
};
# UCI
km_table <- data.frame(t_vec, n.risk, events, censored, n.cens, n.event,
proP_surv, S_t, e_sum, SE, LCI, UCI);
km_table_rounded <-
km_table %>%
mutate_at(vars(e_sum, SE), funs(round(., 5))) %>%
mutate_at(vars(-c(e_sum, SE)), funs(round(., 3)))
0 |
1000 |
3 |
5 |
0 |
0 |
1.000 |
1.000 |
0e+00 |
NA |
NA |
NA |
1 |
992 |
9 |
10 |
15 |
12 |
0.988 |
0.988 |
1e-05 |
0.00347 |
0.981 |
0.995 |
2 |
973 |
7 |
3 |
3 |
7 |
0.993 |
0.981 |
2e-05 |
0.00436 |
0.972 |
0.989 |
3 |
963 |
5 |
4 |
4 |
5 |
0.995 |
0.976 |
3e-05 |
0.00490 |
0.966 |
0.985 |
4 |
954 |
6 |
3 |
3 |
6 |
0.994 |
0.970 |
3e-05 |
0.00547 |
0.959 |
0.980 |
5 |
945 |
5 |
2 |
2 |
5 |
0.995 |
0.964 |
4e-05 |
0.00590 |
0.953 |
0.976 |
6 |
938 |
1 |
4 |
4 |
1 |
0.999 |
0.963 |
4e-05 |
0.00599 |
0.952 |
0.975 |
7 |
933 |
1 |
3 |
3 |
1 |
0.999 |
0.962 |
4e-05 |
0.00607 |
0.950 |
0.974 |