We benchmark two versions of the survival Brier score (Graf et al. 1999), namely the Integrated Survival Brier Score (ISBS) and the proposed re-weighted version (RISBS); see the documentation for their respective formulas. The first (ISBS) is not a proper scoring rule (Rindt et al. 2022); the second (RISBS) is (Sonabend et al. 2024). Our goal is to assess whether these scores differ on simulated and real-world datasets and, if so, to understand the reasons behind these differences.
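As a quick reference, the two scores can be sketched as follows; the exact handling of the time grid, the integration weights and the cutoff is described in the measure documentation, so treat this as a rough summary rather than the implementation. For test observation $i$ with observed time $t_i$, event indicator $\delta_i$, predicted survival function $S_i$, censoring survival distribution $\hat{G}$ estimated via Kaplan-Meier on the train set, and integration cutoff $\tau$:

$$
\text{ISBS} = \frac{1}{N}\sum_{i=1}^{N} \frac{1}{\tau}\int_0^{\tau} \left[ \frac{S_i(t)^2 \, \mathbb{1}(t_i \le t, \delta_i = 1)}{\hat{G}(t_i)} + \frac{(1 - S_i(t))^2 \, \mathbb{1}(t_i > t)}{\hat{G}(t)} \right] dt
$$

$$
\text{RISBS} = \frac{1}{N}\sum_{i=1}^{N} \frac{\delta_i}{\hat{G}(t_i)} \, \frac{1}{\tau}\int_0^{\tau} \left[ S_i(t)^2 \, \mathbb{1}(t_i \le t) + (1 - S_i(t))^2 \, \mathbb{1}(t_i > t) \right] dt
$$

The key difference is the censoring weight: the ISBS switches between $\hat{G}(t_i)$ and $\hat{G}(t)$ depending on whether observation $i$ has already experienced the event at time $t$, whereas the RISBS applies a single weight $\delta_i / \hat{G}(t_i)$ per observation, so censored observations contribute only through $\hat{G}$.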
Load libraries:
library(tidyverse)
library(mlr3proba)
library(DT)
library(ggpubr)
library(ComplexHeatmap)
library(circlize)
For the `res.rds` object, contact the author via a GitHub issue.

We simulate datasets with varying characteristics:
The ‘fixed’ parameters in our simulations are the following:
For each simulated dataset, we performed a simple train/test resampling (70%/30%). Each resampling was stratified by the `status` variable so that the proportion of censoring remains the same in the respective train and test sets.
We trained 3 models on each train set, namely the Kaplan-Meier estimator, the Cox Proportional Hazards (CoxPH) model and an Accelerated Failure Time (AFT) model with a Weibull distribution for the time-to-event outcome. We evaluated each model on the corresponding test set using the ISBS and RISBS measures, integrating up to the 80% quantile of the event times of each train set.
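To make this evaluation step concrete, here is a minimal sketch of a single split scored with both measures in mlr3proba. Treat it as an illustration rather than the benchmark code: the `proper` and `t_max` arguments of `msr("surv.graf")` and the use of the built-in `rats` task are assumptions for the example:

# Illustrative sketch only: one 70%/30% holdout scored with both measures.
# Assumes the `proper` and `t_max` arguments of msr("surv.graf") behave as in
# recent mlr3proba versions; the built-in "rats" task is just a stand-in.
library(mlr3)
library(mlr3proba)

task = tsk("rats")
holdout = rsmp("holdout", ratio = 0.7)
holdout$instantiate(task) # the actual benchmark stratified this split by `status`
train_ids = holdout$train_set(1)
test_ids = holdout$test_set(1)

cox = lrn("surv.coxph")
cox$train(task, row_ids = train_ids)
pred = cox$predict(task, row_ids = test_ids)

# integrate up to the 80% quantile of the train set's event times
t_max = unname(quantile(task$unique_event_times(train_ids), probs = 0.8))

isbs = msr("surv.graf", proper = FALSE, t_max = t_max) # classic (improper) version
risbs = msr("surv.graf", proper = TRUE, t_max = t_max) # re-weighted (proper) version

# task/train_set are passed so the censoring distribution can be estimated from the train data
pred$score(isbs, task = task, train_set = train_ids)
pred$score(risbs, task = task, train_set = train_ids)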
Get the benchmark results:
res = readRDS(file = "res.rds")
We divide the presentation of the simulation results into 4 sub-sections, according to the combination of the proportional hazards (`prop_haz`) and dependent censoring (`cens_dep`) simulation settings:
For each combination of the number of observations (`n_obs`) and the censoring proportion (`cens_prop`) (100 simulated datasets per combination), we calculate the following summary statistics comparing RISBS and ISBS: Pearson correlation, mean absolute difference and its standard deviation, and root mean square error (RMSE):
res_ph_ind =
  res |>
  drop_na() |> # exclude few datasets where AFT prediction didn't work
  filter(prop_haz == TRUE, cens_dep == FALSE) |>
  group_by(n_obs, cens_prop) |>
  summarize(
    .groups = "drop",
    km_cor = cor(km_proper, km_improper),
    cox_cor = cor(cox_proper, cox_improper),
    aft_cor = cor(aft_proper, aft_improper),
    km_diff_mean = mean(abs(km_proper - km_improper)),
    cox_diff_mean = mean(abs(cox_proper - cox_improper)),
    aft_diff_mean = mean(abs(aft_proper - aft_improper)),
    km_diff_sd = sd(abs(km_proper - km_improper)),
    cox_diff_sd = sd(abs(cox_proper - cox_improper)),
    aft_diff_sd = sd(abs(aft_proper - aft_improper)),
    km_rmse = sqrt(mean((km_proper - km_improper)^2)),
    cox_rmse = sqrt(mean((cox_proper - cox_improper)^2)),
    aft_rmse = sqrt(mean((aft_proper - aft_improper)^2))
  )
res_ph_ind |>
  datatable(
    rownames = FALSE,
    options = list(pageLength = 10, searching = TRUE)) |>
  formatRound(columns = 2:14, digits = c(1, rep(2,3), rep(3,9)))
Visualizing the RMSE between RISBS and ISBS:
cox_mat =
  res_ph_ind |>
  select(n_obs, cens_prop, cox_rmse) |>
  pivot_wider(names_from = cens_prop, values_from = cox_rmse) |>
  arrange(desc(n_obs)) |>
  column_to_rownames(var = "n_obs") |>
  as.matrix()

# color function
max_val = round(max(c(res_ph_ind$km_rmse, res_ph_ind$cox_rmse, res_ph_ind$aft_rmse)), digits = 3) + 0.001
col_fun = circlize::colorRamp2(c(0, max_val/2, max_val), c("#e0f3db", "#a8ddb5", "#43a2ca"))
Heatmap(cox_mat, name = "RMSE", cluster_rows = FALSE, cluster_columns = FALSE,
column_title = "Censoring Proportion", column_title_side = "bottom",
column_labels = paste0(seq(from = 10, to = 80, by = 10), "%"),
column_names_rot = 0, column_names_centered = TRUE,
row_names_side = "left", row_title = "#Observations", col = col_fun)
aft_mat =
  res_ph_ind |>
  select(n_obs, cens_prop, aft_rmse) |>
  pivot_wider(names_from = cens_prop, values_from = aft_rmse) |>
  arrange(desc(n_obs)) |>
  column_to_rownames(var = "n_obs") |>
  as.matrix()
Heatmap(aft_mat, name = "RMSE", cluster_rows = FALSE, cluster_columns = FALSE,
column_title = "Censoring Proportion", column_title_side = "bottom",
column_labels = paste0(seq(from = 10, to = 80, by = 10), "%"),
column_names_rot = 0, column_names_centered = TRUE, col = col_fun,
row_names_side = "left", row_title = "#Observations")
km_mat =
  res_ph_ind |>
  select(n_obs, cens_prop, km_rmse) |>
  pivot_wider(names_from = cens_prop, values_from = km_rmse) |>
  arrange(desc(n_obs)) |>
  column_to_rownames(var = "n_obs") |>
  as.matrix()
Heatmap(km_mat, name = "RMSE", cluster_rows = FALSE, cluster_columns = FALSE,
column_title = "Censoring Proportion", column_title_side = "bottom",
column_labels = paste0(seq(from = 10, to = 80, by = 10), "%"),
column_names_rot = 0, column_names_centered = TRUE, col = col_fun,
row_names_side = "left", row_title = "#Observations")
For each combination of the number of observations (`n_obs`) and the censoring proportion (`cens_prop`) (100 simulated datasets per combination), we calculate the following summary statistics comparing RISBS and ISBS: Pearson correlation, mean absolute difference and its standard deviation, and root mean square error (RMSE):
res_ph_dep =
  res |>
  drop_na() |> # exclude few datasets where AFT prediction didn't work
  filter(prop_haz == TRUE, cens_dep == TRUE) |>
  group_by(n_obs, cens_prop) |>
  summarize(
    .groups = "drop",
    km_cor = cor(km_proper, km_improper),
    cox_cor = cor(cox_proper, cox_improper),
    aft_cor = cor(aft_proper, aft_improper),
    km_diff_mean = mean(abs(km_proper - km_improper)),
    cox_diff_mean = mean(abs(cox_proper - cox_improper)),
    aft_diff_mean = mean(abs(aft_proper - aft_improper)),
    km_diff_sd = sd(abs(km_proper - km_improper)),
    cox_diff_sd = sd(abs(cox_proper - cox_improper)),
    aft_diff_sd = sd(abs(aft_proper - aft_improper)),
    km_rmse = sqrt(mean((km_proper - km_improper)^2)),
    cox_rmse = sqrt(mean((cox_proper - cox_improper)^2)),
    aft_rmse = sqrt(mean((aft_proper - aft_improper)^2))
  )
res_ph_dep |>
  datatable(
    rownames = FALSE,
    options = list(pageLength = 10, searching = TRUE)) |>
  formatRound(columns = 2:14, digits = c(1, rep(2,3), rep(3,9)))
Visualizing the RMSE between RISBS and ISBS:
cox_mat =
  res_ph_dep |>
  select(n_obs, cens_prop, cox_rmse) |>
  pivot_wider(names_from = cens_prop, values_from = cox_rmse) |>
  arrange(desc(n_obs)) |>
  column_to_rownames(var = "n_obs") |>
  as.matrix()

# color function
max_val = round(max(c(res_ph_dep$km_rmse, res_ph_dep$cox_rmse, res_ph_dep$aft_rmse)), digits = 3) + 0.001
col_fun = circlize::colorRamp2(c(0, max_val/2, max_val), c("#e0f3db", "#a8ddb5", "#43a2ca"))
Heatmap(cox_mat, name = "RMSE", cluster_rows = FALSE, cluster_columns = FALSE,
column_title = "Censoring Proportion", column_title_side = "bottom",
column_labels = paste0(seq(from = 10, to = 80, by = 10), "%"),
column_names_rot = 0, column_names_centered = TRUE, col = col_fun,
row_names_side = "left", row_title = "#Observations")
aft_mat =
  res_ph_dep |>
  select(n_obs, cens_prop, aft_rmse) |>
  pivot_wider(names_from = cens_prop, values_from = aft_rmse) |>
  arrange(desc(n_obs)) |>
  column_to_rownames(var = "n_obs") |>
  as.matrix()
Heatmap(aft_mat, name = "RMSE", cluster_rows = FALSE, cluster_columns = FALSE,
column_title = "Censoring Proportion", column_title_side = "bottom",
column_labels = paste0(seq(from = 10, to = 80, by = 10), "%"),
column_names_rot = 0, column_names_centered = TRUE, col = col_fun,
row_names_side = "left", row_title = "#Observations")
km_mat =
  res_ph_dep |>
  select(n_obs, cens_prop, km_rmse) |>
  pivot_wider(names_from = cens_prop, values_from = km_rmse) |>
  arrange(desc(n_obs)) |>
  column_to_rownames(var = "n_obs") |>
  as.matrix()
Heatmap(km_mat, name = "RMSE", cluster_rows = FALSE, cluster_columns = FALSE,
column_title = "Censoring Proportion", column_title_side = "bottom",
column_labels = paste0(seq(from = 10, to = 80, by = 10), "%"),
column_names_rot = 0, column_names_centered = TRUE, col = col_fun,
row_names_side = "left", row_title = "#Observations")
For each combination of the number of observations (`n_obs`) and the censoring proportion (`cens_prop`) (100 simulated datasets per combination), we calculate the following summary statistics comparing RISBS and ISBS: Pearson correlation, mean absolute difference and its standard deviation, and root mean square error (RMSE):
res_noph_ind =
  res |>
  drop_na() |> # exclude few datasets where AFT prediction didn't work
  filter(prop_haz == FALSE, cens_dep == FALSE) |>
  group_by(n_obs, cens_prop) |>
  summarize(
    .groups = "drop",
    km_cor = cor(km_proper, km_improper),
    cox_cor = cor(cox_proper, cox_improper),
    aft_cor = cor(aft_proper, aft_improper),
    km_diff_mean = mean(abs(km_proper - km_improper)),
    cox_diff_mean = mean(abs(cox_proper - cox_improper)),
    aft_diff_mean = mean(abs(aft_proper - aft_improper)),
    km_diff_sd = sd(abs(km_proper - km_improper)),
    cox_diff_sd = sd(abs(cox_proper - cox_improper)),
    aft_diff_sd = sd(abs(aft_proper - aft_improper)),
    km_rmse = sqrt(mean((km_proper - km_improper)^2)),
    cox_rmse = sqrt(mean((cox_proper - cox_improper)^2)),
    aft_rmse = sqrt(mean((aft_proper - aft_improper)^2))
  )
res_noph_ind |>
  datatable(
    rownames = FALSE,
    options = list(pageLength = 10, searching = TRUE)) |>
  formatRound(columns = 2:14, digits = c(1, rep(2,3), rep(3,9)))
Visualizing the RMSE between RISBS and ISBS:
cox_mat =
  res_noph_ind |>
  select(n_obs, cens_prop, cox_rmse) |>
  pivot_wider(names_from = cens_prop, values_from = cox_rmse) |>
  arrange(desc(n_obs)) |>
  column_to_rownames(var = "n_obs") |>
  as.matrix()

# color function
max_val = round(max(c(res_noph_ind$km_rmse, res_noph_ind$cox_rmse, res_noph_ind$aft_rmse)), digits = 3) + 0.001
col_fun = circlize::colorRamp2(c(0, max_val/2, max_val), c("#e0f3db", "#a8ddb5", "#43a2ca"))
Heatmap(cox_mat, name = "RMSE", cluster_rows = FALSE, cluster_columns = FALSE,
column_title = "Censoring Proportion", column_title_side = "bottom",
column_labels = paste0(seq(from = 10, to = 80, by = 10), "%"),
column_names_rot = 0, column_names_centered = TRUE, col = col_fun,
row_names_side = "left", row_title = "#Observations")
aft_mat =
  res_noph_ind |>
  select(n_obs, cens_prop, aft_rmse) |>
  pivot_wider(names_from = cens_prop, values_from = aft_rmse) |>
  arrange(desc(n_obs)) |>
  column_to_rownames(var = "n_obs") |>
  as.matrix()
Heatmap(aft_mat, name = "RMSE", cluster_rows = FALSE, cluster_columns = FALSE,
column_title = "Censoring Proportion", column_title_side = "bottom",
column_labels = paste0(seq(from = 10, to = 80, by = 10), "%"),
column_names_rot = 0, column_names_centered = TRUE, col = col_fun,
row_names_side = "left", row_title = "#Observations")
km_mat =
  res_noph_ind |>
  select(n_obs, cens_prop, km_rmse) |>
  pivot_wider(names_from = cens_prop, values_from = km_rmse) |>
  arrange(desc(n_obs)) |>
  column_to_rownames(var = "n_obs") |>
  as.matrix()
Heatmap(km_mat, name = "RMSE", cluster_rows = FALSE, cluster_columns = FALSE,
column_title = "Censoring Proportion", column_title_side = "bottom",
column_labels = paste0(seq(from = 10, to = 80, by = 10), "%"),
column_names_rot = 0, column_names_centered = TRUE, col = col_fun,
row_names_side = "left", row_title = "#Observations")
For each combination of the number of observations (`n_obs`) and the censoring proportion (`cens_prop`) (100 simulated datasets per combination), we calculate the following summary statistics comparing RISBS and ISBS: Pearson correlation, mean absolute difference and its standard deviation, and root mean square error (RMSE):
res_noph_dep =
  res |>
  drop_na() |> # exclude few datasets where AFT prediction didn't work
  filter(prop_haz == FALSE, cens_dep == TRUE) |>
  group_by(n_obs, cens_prop) |>
  summarize(
    .groups = "drop",
    km_cor = cor(km_proper, km_improper),
    cox_cor = cor(cox_proper, cox_improper),
    aft_cor = cor(aft_proper, aft_improper),
    km_diff_mean = mean(abs(km_proper - km_improper)),
    cox_diff_mean = mean(abs(cox_proper - cox_improper)),
    aft_diff_mean = mean(abs(aft_proper - aft_improper)),
    km_diff_sd = sd(abs(km_proper - km_improper)),
    cox_diff_sd = sd(abs(cox_proper - cox_improper)),
    aft_diff_sd = sd(abs(aft_proper - aft_improper)),
    km_rmse = sqrt(mean((km_proper - km_improper)^2)),
    cox_rmse = sqrt(mean((cox_proper - cox_improper)^2)),
    aft_rmse = sqrt(mean((aft_proper - aft_improper)^2))
  )
res_noph_dep |>
  datatable(
    rownames = FALSE,
    options = list(pageLength = 10, searching = TRUE)) |>
  formatRound(columns = 2:14, digits = c(1, rep(2,3), rep(3,9)))
Visualizing the RMSE between RISBS and ISBS:
cox_mat =
  res_noph_dep |>
  select(n_obs, cens_prop, cox_rmse) |>
  pivot_wider(names_from = cens_prop, values_from = cox_rmse) |>
  arrange(desc(n_obs)) |>
  column_to_rownames(var = "n_obs") |>
  as.matrix()

# color function
max_val = round(max(c(res_noph_dep$km_rmse, res_noph_dep$cox_rmse, res_noph_dep$aft_rmse)), digits = 3) + 0.001
col_fun = circlize::colorRamp2(c(0, max_val/2, max_val), c("#e0f3db", "#a8ddb5", "#43a2ca"))
Heatmap(cox_mat, name = "RMSE", cluster_rows = FALSE, cluster_columns = FALSE,
column_title = "Censoring Proportion", column_title_side = "bottom",
column_labels = paste0(seq(from = 10, to = 80, by = 10), "%"),
column_names_rot = 0, column_names_centered = TRUE, col = col_fun,
row_names_side = "left", row_title = "#Observations")
aft_mat =
  res_noph_dep |>
  select(n_obs, cens_prop, aft_rmse) |>
  pivot_wider(names_from = cens_prop, values_from = aft_rmse) |>
  arrange(desc(n_obs)) |>
  column_to_rownames(var = "n_obs") |>
  as.matrix()
Heatmap(aft_mat, name = "RMSE", cluster_rows = FALSE, cluster_columns = FALSE,
column_title = "Censoring Proportion", column_title_side = "bottom",
column_labels = paste0(seq(from = 10, to = 80, by = 10), "%"),
column_names_rot = 0, column_names_centered = TRUE, col = col_fun,
row_names_side = "left", row_title = "#Observations")
km_mat =
  res_noph_dep |>
  select(n_obs, cens_prop, km_rmse) |>
  pivot_wider(names_from = cens_prop, values_from = km_rmse) |>
  arrange(desc(n_obs)) |>
  column_to_rownames(var = "n_obs") |>
  as.matrix()
Heatmap(km_mat, name = "RMSE", cluster_rows = FALSE, cluster_columns = FALSE,
column_title = "Censoring Proportion", column_title_side = "bottom",
column_labels = paste0(seq(from = 10, to = 80, by = 10), "%"),
column_names_rot = 0, column_names_centered = TRUE, col = col_fun,
row_names_side = "left", row_title = "#Observations")
We convert each dataset to an `mlr3` task and extract useful info, namely:

- `n_obs`: Number of observations
- `n_vars`: Number of total variables
- `n_factors`: Number of factor/categorical variables
- `n_numeric`: Number of numeric variables
- `cens_prop`: Proportion of censoring
- `admin_cens_prop`: Proportion of censored observations that are censored administratively, i.e. at the last censoring time
- `dep_cens_prop`: Proportion of significant coefficients (adjusted p < 0.05) in a logistic regression model predicting the censoring status
- `prop_haz`: Whether the dataset satisfies the proportional hazards assumption (p > 0.05 using the Grambsch-Therneau test (Grambsch and Therneau 1994))

We used a total of 26 real-world, low-dimensional datasets (fewer features than observations) for benchmarking, some of which are freely available via various R packages. For each dataset, we performed a simple train/test resampling (80%/20%) 100 times. Each resampling was stratified by the `status` variable so that the proportion of censoring remains the same in the respective train and test sets.
We trained 3 models on each train set, namely the Kaplan-Meier estimator, the Cox Proportional Hazards (CoxPH) model and an Accelerated Failure Time (AFT) model with a Weibull distribution for the time-to-event outcome. We evaluated each model on the corresponding test set using the ISBS and RISBS measures, integrating up to the 80% quantile of the event times of each train set. We also kept the Brier scores of each individual observation (observation-wise scores) in all respective test sets.
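The task preparation itself lives in `prepare_tasks.R` (loaded below). As a rough sketch of how the last two quantities in the list above (`dep_cens_prop`, `prop_haz`) might be computed for a single dataset, under assumptions (the `veteran` data as a stand-in, a BH p-value adjustment, and a global rather than per-covariate Grambsch-Therneau p-value):

# Illustrative sketch only -- assumes a data.frame with `time`, `status` and
# covariate columns; the BH adjustment and global PH test are assumptions
library(survival)

dat = survival::veteran
dat$cens = as.integer(dat$status == 0) # censoring indicator
covs = setdiff(names(dat), c("time", "status", "cens"))

# dep_cens_prop: proportion of coefficients with adjusted p < 0.05 in a logistic
# regression predicting the censoring status
fit_cens = glm(as.formula(paste("cens ~", paste(covs, collapse = " + "))),
               data = dat, family = binomial())
p_adj = p.adjust(summary(fit_cens)$coefficients[-1, "Pr(>|z|)"], method = "BH")
dep_cens_prop = mean(p_adj < 0.05)

# prop_haz: PH assumption "satisfied" if the global Grambsch-Therneau test
# on a Cox model has p > 0.05
fit_cox = coxph(as.formula(paste("Surv(time, status) ~", paste(covs, collapse = " + "))),
                data = dat)
prop_haz = cox.zph(fit_cox)$table["GLOBAL", "p"] > 0.05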
# task info, see `prepare_tasks.R`
task_tbl = readRDS(file = "task_tbl.rds")
task_tbl |>
  select(-task) |>
datatable(
rownames = FALSE,
options = list(pageLength = 13, searching = TRUE,
order = list(list(0, 'asc')))) |>
formatRound(columns = 6:8, digits = 2) |>
formatStyle(columns = 'prop_haz',
backgroundColor = styleEqual(c(TRUE, FALSE), c("#4DAF4A", "#E41A1C")))
Get the benchmark results:
# see `run_bench.R`
# Compressed file: https://github.com/survival-org/scoring-rules-2024/blob/main/bench_res.rds
= readRDS(file = "bench_res.rds") bench_res
We calculate the Pearson correlation and root mean square error (RMSE) between RISBS and ISBS scores per dataset (100 resamplings) and model:
score_corrs =
  bench_res |>
  group_by(task_id) |>
  select(ends_with("proper")) |>
  summarize(
    km_cor = cor(km_proper, km_improper),
    cox_cor = cor(cox_proper, cox_improper),
    aft_cor = cor(aft_proper, aft_improper),
    km_rmse = sqrt(mean((km_proper - km_improper)^2)),
    cox_rmse = sqrt(mean((cox_proper - cox_improper)^2)),
    aft_rmse = sqrt(mean((aft_proper - aft_improper)^2))
  ) |>
  arrange(desc(cox_cor))
score_corrs |>
  datatable(
    rownames = FALSE,
    options = list(pageLength = 13, searching = TRUE, order = list(list(2, 'desc')))
  ) |>
  formatRound(columns = 2:7, digits = 3)
Mean and standard deviation of Pearson's correlation and RMSE across all datasets:
score_corrs |>
  summarise(across(-task_id, list(mean = mean, sd = sd))) |>
  pivot_longer(cols = everything(),
               names_to = c("model", "statistic", ".value"),
               names_pattern = "(.*)_(.*)_(.*)"
  ) |>
  knitr::kable(digits = 3)
| model | statistic | mean  | sd    |
|-------|-----------|-------|-------|
| km    | cor       | 0.967 | 0.089 |
| cox   | cor       | 0.973 | 0.039 |
| aft   | cor       | 0.973 | 0.039 |
| km    | rmse      | 0.007 | 0.008 |
| cox   | rmse      | 0.007 | 0.007 |
| aft   | rmse      | 0.007 | 0.007 |
Scatter plots of RISBS vs ISBS scores per dataset (100 dots/resamplings per figure). We add the regression line in each plot. Datasets are ordered by decreasing correlation between the two metrics:
# order is by decreasing correlation
for (id in score_corrs$task_id) {
  p =
    bench_res |>
    filter(task_id == id) |>
    select(ends_with("proper")) |>
    pivot_longer(cols = everything(), names_to = c("model", ".value"), names_pattern = "(.*)_(.*)") |>
    mutate(model = factor(model, levels = c("km", "cox", "aft"))) |>
    # scatter plot with Pearson's coef.
    ggpubr::ggscatter(
      x = "proper", y = "improper",
      facet.by = c("model"),
      panel.labs = list(model = c("Kaplan-Meier", "CoxPH", "AFT (Weibull)")),
      xlab = "RISBS (proper)",
      ylab = "ISBS (improper)",
      color = "black", shape = 21, size = 2,
      add = "reg.line", # Add regression line
      add.params = list(color = "blue", fill = "lightgray"), # Customize regr. line
      conf.int = TRUE, # Add confidence interval
      cor.coef = TRUE, # Add Pearson's correlation coefficient
      cor.coeff.args = list(method = "pearson", label.sep = "\n")
    ) +
    labs(title = id) +
    theme(panel.spacing = unit(1, "cm"))
  print(p)
}
Scatter plots of RISBS vs ISBS observation-wise scores per dataset. Every figure has a total number of dots equal to (100 resamplings) x (number of test observations in each resampling). We draw the reference y = x line in each plot. Datasets are ordered by decreasing proportion of censoring after applying the 80% quantile `t_max` cutoff to each full dataset.
cens_props = vapply(task_tbl$task, function(task) {
  event_times = task$unique_event_times()
  t_max = unname(quantile(event_times, probs = 0.8))
  truth = task$truth()
  times = truth[, 1]
  status = truth[, 2]
  status_tmax = status[times <= t_max]
  sum(status_tmax == 0)/length(status_tmax) # censoring proportion
}, numeric(1))
names(cens_props) = task_tbl$task_id
ids = names(sort(cens_props, decreasing = TRUE))
for (id in ids) {
  p =
    bench_res |>
    filter(task_id == id) |>
    select(ends_with("scores"), test_status) |> # `test_status` is the censoring status for coloring
    unnest(cols = everything()) |>
    pivot_longer(cols = ends_with("scores"), names_to = c("model", "type", ".value"), names_pattern = "(.*)_(.*)_(.*)") |>
    pivot_wider(names_from = type, values_from = scores, values_fn = list) |>
    unnest(cols = everything()) |>
    rename(Status = test_status) |>
    mutate(Status = case_when(Status == 0 ~ "Censored", TRUE ~ "Event")) |>
    mutate(
      Status = as.factor(Status),
      model = factor(model, levels = c("km", "cox", "aft"))
    ) |>
    # scatter plot with Pearson's coef.
    ggpubr::ggscatter(
      x = "proper", y = "improper",
      facet.by = c("model"),
      panel.labs = list(model = c("Kaplan-Meier", "CoxPH", "AFT (Weibull)")),
      xlab = "RISBS (proper)",
      ylab = "ISBS (improper)",
      color = "Status", shape = 21, size = 2, alpha = 0.8,
      palette = c("Censored" = "#377eb8", "Event" = "#e41a1c"),
      cor.coef = TRUE, # Add Pearson's correlation coefficient
      cor.coeff.args = list(method = "pearson", label.sep = "\n")
    ) +
    labs(title = id) +
    geom_abline(slope = 1, size = 0.8) +
    theme(panel.spacing = unit(1, "cm"))
  print(p)
}
Note that in the last 4 datasets we have censored observations with $t > t_{max}$, so they are excluded from the calculation of the Brier scores, since we integrate up to the `t_max` time horizon cutoff. Therefore only observations that have experienced the event contribute to the scores (but the estimation of the censoring distribution via the Kaplan-Meier uses all observations of a train set).

Results only for the Cox model and two datasets:
# RISBS vs ISBS
scores =
  bench_res |>
  filter(task_id == "veteran") |>
  select(cox_proper, cox_improper) # 100 resamplings

RMSE = sqrt(mean((scores$cox_proper - scores$cox_improper)^2))
mean(abs(scores$cox_proper - scores$cox_improper))
[1] 0.006368727
sd(abs(scores$cox_proper - scores$cox_improper))
[1] 0.006689354
p1 = ggpubr::ggscatter(data = scores, x = "cox_proper", y = "cox_improper",
  xlab = "RISBS (proper)", ylab = "ISBS (improper)",
  shape = 21, size = 2, alpha = 0.8,
  add = "reg.line", # Add regression line
  add.params = list(color = "blue", fill = "lightgray"), # Customize regr. line
  conf.int = TRUE, # Add confidence interval
  cor.coef = TRUE, # Add Pearson's correlation coefficient
  cor.coef.size = 8,
  cor.coeff.args = list(method = "pearson", label.sep = "\n"),
  ggtheme = theme_classic(base_size = 18)
) +
  labs(title = "VETERAN") +
  theme(plot.title = element_text(hjust = 0.5)) +
  annotate("text", size = 8, x = 0.175, y = 0.12, hjust = 0,
           label = paste0("RMSE = ", round(RMSE, digits = 3)))
p1
# ggsave(filename = "p1.pdf", width = 7, height = 5)
# per observation scores
obs_scores =
  bench_res |>
  filter(task_id == "veteran") |>
  select(cox_proper_scores, cox_improper_scores, test_status) |> # `test_status` is the censoring status for coloring
  rename(proper_scores = cox_proper_scores, improper_scores = cox_improper_scores) |>
  unnest(cols = everything()) |>
  pivot_longer(cols = ends_with("scores"), names_to = c("type", ".value"), names_pattern = "(.*)_(.*)") |>
  pivot_wider(names_from = type, values_from = scores, values_fn = list) |>
  unnest(cols = everything()) |>
  rename(Status = test_status) |>
  mutate(Status = as.factor(case_when(Status == 0 ~ "Censored", TRUE ~ "Event")))
# 18 - 28 test observations after `t_max` cutoff per resampling (28 is 137 * 20%)
status = bench_res |> filter(task_id == "veteran") |> pull(test_status)
map(status, length) |> unlist() |> summary()
Min. 1st Qu. Median Mean 3rd Qu. Max.
18.00 21.00 23.00 23.23 25.00 28.00
# truth: 2143 events, 180 censored
obs_scores$Status |> table()
Censored Event
180 2143
RMSE_all = sqrt(mean((obs_scores$proper - obs_scores$improper)^2))
RMSE_event = sqrt(mean((
  obs_scores |> filter(Status == "Event") |> pull(proper) -
  obs_scores |> filter(Status == "Event") |> pull(improper))^2))
RMSE_cens = sqrt(mean((
  obs_scores |> filter(Status == "Censored") |> pull(proper) -
  obs_scores |> filter(Status == "Censored") |> pull(improper))^2))
p2 = ggpubr::ggscatter(data = obs_scores, x = "proper", y = "improper",
  xlab = "RISBS (proper)", ylab = "ISBS (improper)",
  color = "Status", shape = 21, size = 3, alpha = 0.8,
  palette = c("Censored" = "#377eb8", "Event" = "#e41a1c"),
  cor.coef = TRUE, # Add Pearson's correlation coefficient
  cor.coef.size = 8,
  cor.coeff.args = list(method = "pearson"),
  ggtheme = theme_classic(base_size = 18)
) +
  labs(title = "VETERAN") +
  theme(legend.position = "none") +
  geom_abline(slope = 1, size = 0.8) +
  theme(plot.title = element_text(hjust = 0.5)) +
  annotate("text", size = 6, x = 0.35, y = c(0.25, 0.2, 0.15), hjust = 0,
           label = c(paste0("RMSE (all) = ", round(RMSE_all, digits = 3)),
                     paste0("RMSE (Event) = ", round(RMSE_event, digits = 3)),
                     paste0("RMSE (Censored) = ", round(RMSE_cens, digits = 3))))
p2
# ggsave(filename = "p2.pdf", width = 7, height = 5)
# RISBS vs ISBS
scores =
  bench_res |>
  filter(task_id == "nafld1") |>
  select(cox_proper, cox_improper) # 100 resamplings

RMSE = sqrt(mean((scores$cox_proper - scores$cox_improper)^2))
mean(abs(scores$cox_proper - scores$cox_improper))
[1] 0.002702623
sd(abs(scores$cox_proper - scores$cox_improper))
[1] 0.0008136839
p3 = ggpubr::ggscatter(data = scores, x = "cox_proper", y = "cox_improper",
  xlab = "RISBS (proper)", ylab = "ISBS (improper)",
  shape = 21, size = 2, alpha = 0.8,
  add = "reg.line", # Add regression line
  add.params = list(color = "blue", fill = "lightgray"), # Customize regr. line
  conf.int = TRUE, # Add confidence interval
  cor.coef = TRUE, # Add Pearson's correlation coefficient
  cor.coef.size = 8,
  cor.coeff.args = list(method = "pearson", label.sep = "\n"),
  ggtheme = theme_classic(base_size = 18)
) +
  labs(title = "NAFLD") +
  theme(plot.title = element_text(hjust = 0.5)) +
  annotate("text", size = 8, x = 0.044, y = 0.04, hjust = 0,
           label = paste0("RMSE = ", round(RMSE, digits = 3)))
p3
# ggsave(filename = "p3.pdf", width = 7, height = 5)
# per observation scores
obs_scores =
  bench_res |>
  filter(task_id == "nafld1") |>
  select(cox_proper_scores, cox_improper_scores, test_status) |> # `test_status` is the censoring status for coloring
  rename(proper_scores = cox_proper_scores, improper_scores = cox_improper_scores) |>
  unnest(cols = everything()) |>
  pivot_longer(cols = ends_with("scores"), names_to = c("type", ".value"), names_pattern = "(.*)_(.*)") |>
  pivot_wider(names_from = type, values_from = scores, values_fn = list) |>
  unnest(cols = everything()) |>
  rename(Status = test_status) |>
  mutate(Status = as.factor(case_when(Status == 0 ~ "Censored", TRUE ~ "Event")))
# 509 - 597 test observations after `t_max` cutoff per resampling (it should
# have been 800 = 4000 * 20%, so we get fewer observations contributing to
# the scores and, given 92% censoring, we know that most of these are censored obs :)
status = bench_res |> filter(task_id == "nafld1") |> pull(test_status)
map(status, length) |> unlist() |> summary()
Min. 1st Qu. Median Mean 3rd Qu. Max.
509.0 541.0 556.5 553.8 566.2 597.0
# truth: 5181 events, 50194 censored
obs_scores$Status |> table()
Censored Event
50194 5181
RMSE_all = sqrt(mean((obs_scores$proper - obs_scores$improper)^2))
RMSE_event = sqrt(mean((
  obs_scores |> filter(Status == "Event") |> pull(proper) -
  obs_scores |> filter(Status == "Event") |> pull(improper))^2))
RMSE_cens = sqrt(mean((
  obs_scores |> filter(Status == "Censored") |> pull(proper) -
  obs_scores |> filter(Status == "Censored") |> pull(improper))^2))
p4 = ggpubr::ggscatter(data = obs_scores, x = "proper", y = "improper",
  xlab = "RISBS (proper)", ylab = "ISBS (improper)",
  color = "Status", shape = 21, size = 3, alpha = 0.8,
  palette = c("Censored" = "#377eb8", "Event" = "#e41a1c"),
  cor.coef = TRUE, # Add Pearson's correlation coefficient
  cor.coef.size = 8,
  cor.coeff.args = list(method = "pearson"),
  ggtheme = theme_classic(base_size = 18)
) +
  labs(title = "NAFLD") +
  geom_abline(slope = 1, size = 0.8) +
  theme(plot.title = element_text(hjust = 0.5)) +
  annotate("text", size = 5, x = 0.4, y = c(0.22, 0.16, 0.10), hjust = 0,
           label = c(paste0("RMSE (all) = ", round(RMSE_all, digits = 3)),
                     paste0("RMSE (Event) = ", round(RMSE_event, digits = 3)),
                     paste0("RMSE (Censored) = ", round(RMSE_cens, digits = 3))))
p4
# ggsave(filename = "p4.pdf", width = 7, height = 5)