Code
library(dplyr)
library(tibble)
library(ggplot2)
library(patchwork)
library(akima)
library(purrr)
library(tidyr)
library(DT)Load libraries:
library(dplyr)
library(tibble)
library(ggplot2)
library(patchwork)
library(akima)
library(purrr)
library(tidyr)
library(DT)To illustrate the diversity of the Weibull distributions used in our simulation design, we plot both the density f(t) and survival S(t) functions for 10 randomly sampled shape \alpha and scale \sigma parameter pairs. These distributions represent either the true survival, predicted survival, or censoring times across simulations. The wide parameter range induces substantial variability, ensuring a rich family of distributions and increasing the sensitivity of our evaluation to detect violations of properness. By construction, all data-generating mechanisms assume Y independent of C, satisfying the random censoring assumption.
# Sample 10 (shape alpha, scale sigma) Weibull parameter pairs reproducibly.
set.seed(20250408)
n_pairs <- 10
shape <- runif(n_pairs, 0.5, 5)
scale <- runif(n_pairs, 0.5, 5)

# Common evaluation grid for t; its length drives the `each` replication below
# (previously hard-coded as 41).
tseq <- seq.int(1, 5, 0.1)
pair_lab <- sprintf("(%.2f,%.2f)", shape, scale)

# Density f(t) for every parameter pair, reshaped to long format:
# mapply returns a length(tseq) x n_pairs matrix; melt stacks it column by
# column, so t repeats per pair and labels repeat `each = length(tseq)` times.
dw <- mapply(function(shape, scale) dweibull(tseq, shape, scale), shape, scale) |>
reshape2::melt() |>
mutate(x = rep(tseq, n_pairs),
group = rep(pair_lab, each = length(tseq))) |>
select(x, group, value)

# Survival S(t): use lower.tail = FALSE (spelled out; `F` is reassignable).
pw <- mapply(function(shape, scale) pweibull(tseq, shape, scale, lower.tail = FALSE), shape, scale) |>
reshape2::melt() |>
mutate(x = rep(tseq, n_pairs),
group = rep(pair_lab, each = length(tseq))) |>
select(x, group, value)

# Stack the density (top) and survival (bottom) panels with patchwork,
# collecting the two color legends into one; `&` applies the layers/theme to
# both panels.
(ggplot(dw, aes(x = x, y = value, group = group, color = group)) + ylab("f(t)") + xlab("t")) /
(ggplot(pw, aes(x = x, y = value, group = group, color = group)) + ylab("S(t)") + xlab("t")) +
plot_layout(guides = 'collect') &
geom_line() &
theme_classic(base_size = 14) &
guides(color = guide_legend(title = '(α,σ)')) &
scale_color_viridis_d(option = "turbo")
#ggsave("weibull_plots.png", device = png, dpi = 600, width = 7, height = 4)This section supports the “Simulation Experiments” section of the paper.
Use properness_test.R to run the experiments. To run for different sample sizes n, use run_tests.sh. Merged simulation results for all n are available here.
Explore data:
# Load merged simulation results (true censoring distribution variant; the
# Kaplan-Meier-estimated variant with suffix _1 is loaded in a later section).
res = readRDS("results/res_sims10000_distrs1000_0.rds")
# Preview 10 random rows interactively; the digits vector rounds the first 8
# of columns 3:40 (i.e. cols 3-10) to 3 digits and the remaining 30 to 5.
res[sample(1:nrow(res), 10), ] |>
DT::datatable(rownames = FALSE, options = list(searching = FALSE)) |>
formatRound(columns = 3:40, digits = c(rep(3,8), rep(5,30)))Each row corresponds to one simulation, using m = 1000 draws from Weibull survival distributions at a fixed sample size n \in {10, \dots, 10\,000}.
The output columns are:
sim: simulation index (k in the paper); n: sample size; n_events: number of uncensored events observed in the simulated dataset; max_t: maximum observed follow-up time; m_true, m_pred: true and predicted mean survival times from the generating Weibull models; m_true_est, m_pred_est: estimated mean survival times using the trapezoidal rule on the observed time point support; S_Y_q10, S_Y_med, S_Y_q90: values of the true event-time survival function S_Y(t) evaluated at the 10th percentile, median, and 90th percentile of the observed time points; S_C_q10, S_C_med, S_C_q90: values of the censoring survival function S_C(t) evaluated at the 10th percentile, median, and 90th percentile of the observed time points; S_Y_tail, S_C_tail: tail survival probability at max_t for the event and censoring distributions; epsilon: \varepsilon = S_Y(t_{max}) \times S_C(t_{max}); {surv|pred|cens}_{scale|shape}: the scale and shape of the Weibull distributions for the true, predicted and censoring times respectively; prop_cens: proportion of censoring in each simulation; tv_dist: the total variation distance between the true and the predicted Weibull distribution (closer to 0 means more similar distributions); {score}_diff, {score}_sd: the mean and standard deviation of the score difference D between true and predicted distributions across draws. Scoring rules analyzed:
We compute 95% confidence intervals (CIs) for the mean score difference D (true - predicted) using a t-distribution. A violation is marked statistically significant if: - The mean score difference exceeds a positive threshold: \bar{D} > 0.0001 - The CI excludes zero (i.e., \text{CI}_\text{lower} > 0)
# Drop parameter/bookkeeping columns not needed for the violation analysis.
# Note: the "m_true|m_pred" patterns also match the *_est columns.
res = res |>
select(!matches("shape|scale|prop_cens|tv_dist|n_events|max_t|m_true|m_pred"))
# Scoring rules to analyze ("wRCLL" is relabeled "RCLL*" below)
measures = c("SBS_q10", "SBS_median", "SBS_q90", "ISBS", "RCLL", "wRCLL")
n_distrs = 1000 # `m` value from paper's experiment
tcrit = qt(0.975, df = n_distrs - 1) # 95% t-test
threshold = 1e-4
# Build one long table: for each scoring rule, compute the t-based 95% CI of
# the mean score difference D and flag a significant properness violation when
# the mean difference exceeds the threshold AND the CI excludes zero.
data = measures |>
lapply(function(m) {
# Column names holding the per-simulation mean and SD of D for this rule
mean_diff_col = paste0(m, "_diff")
mean_sd_col = paste0(m, "_sd")
res |>
select(sim, n, !!mean_diff_col, !!mean_sd_col, starts_with("shift_"),
epsilon, starts_with("S_")) |>
select(-ends_with("tail")) |>
mutate(
# Standard error of the mean over the m = 1000 draws
se = .data[[mean_sd_col]] / sqrt(n_distrs),
CI_lower = .data[[mean_diff_col]] - tcrit * se,
CI_upper = .data[[mean_diff_col]] + tcrit * se,
signif_violation = .data[[mean_diff_col]] > threshold & CI_lower > 0,
# Keep relevant SBS columns
shift_q10 = if (m == "SBS_q10") shift_q10 else NA_real_,
S_Y_q10 = if (m == "SBS_q10") S_Y_q10 else NA_real_,
S_C_q10 = if (m == "SBS_q10") S_C_q10 else NA_real_,
shift_med = if (m == "SBS_median") shift_med else NA_real_,
S_Y_med = if (m == "SBS_median") S_Y_med else NA_real_,
S_C_med = if (m == "SBS_median") S_C_med else NA_real_,
shift_q90 = if (m == "SBS_q90") shift_q90 else NA_real_,
S_Y_q90 = if (m == "SBS_q90") S_Y_q90 else NA_real_,
S_C_q90 = if (m == "SBS_q90") S_C_q90 else NA_real_
) |>
select(!se) |>
mutate(metric = !!m) |>
relocate(metric, .after = 1) |>
rename(diff = !!mean_diff_col,
sd = !!mean_sd_col)
}) |>
bind_rows()
# Order and relabel metrics for tables/plots (wRCLL is displayed as RCLL*)
data$metric = factor(
data$metric,
levels = measures,
labels = c("SBS (Q10)", "SBS (Median)", "SBS (Q90)", "ISBS", "RCLL", "RCLL*")
)For example, the results for the first simulation and n = 250 across all scoring rules show no violations of properness:
# Example: one simulation (sim 1, n = 250) across all six scoring rules;
# drop the SBS-only helper columns before display
data |>
filter(sim == 1, n == 250) |>
select(-starts_with("shift"), -epsilon, -starts_with("S_")) |>
DT::datatable(rownames = FALSE, options = list(searching = FALSE)) |>
formatRound(columns = 4:7, digits = 5)We summarize violations across simulations (sim), by computing for each score & sample size:
# Per (n, metric): count and rate of significant violations, plus summary
# statistics of the mean score difference D among violating simulations only
all_stats =
data |>
group_by(n, metric) |>
summarize(
n_violations = sum(signif_violation),
violation_rate = mean(signif_violation),
# Summaries are NA when a (n, metric) cell has no violations
diff_mean = if (any(signif_violation)) mean(diff[signif_violation]) else NA_real_,
diff_median = if (any(signif_violation)) median(diff[signif_violation]) else NA_real_,
diff_min = if (any(signif_violation)) min(diff[signif_violation]) else NA_real_,
diff_max = if (any(signif_violation)) max(diff[signif_violation]) else NA_real_,
.groups = "drop"
)
# Render as Table 1
all_stats |>
arrange(metric) |>
DT::datatable(
rownames = FALSE,
options = list(searching = FALSE),
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: center; font-size:150%",
"Table 1: Empirical violations of properness")
) |>
formatRound(columns = 4:8, digits = 5)The theoretical shift (i.e. bias) of the global minimum x^* of the expected SBS from the true survival probability is the survival tail product \times a survival probability ratio term at \tau^* (for small \varepsilon > 0):
x^\star - S_Y(\tau^*) = - \frac{\varepsilon (1-S_Y(\tau^*))}{S_C(\tau^*) - \varepsilon} \approx - \varepsilon \cdot \frac{1 - S_Y(\tau^*)}{S_C(\tau^*)}
Filter SBS simulation data for plotting:
# Keep only the three single-time SBS metrics and reshape so each row carries
# the one shift value relevant to its metric (shift_q10/med/q90 -> shift)
plot_data = data |>
filter(grepl("^SBS", metric)) |>
pivot_longer(
cols = starts_with("shift_"),
names_to = "shift_type",
values_to = "shift",
values_drop_na = TRUE # only keep the relevant shift per metric
) |>
mutate(
# Collapse the quantile-specific survival columns into single S_Y / S_C
S_Y = case_when(
metric == "SBS (Q10)" ~ S_Y_q10,
metric == "SBS (Median)" ~ S_Y_med,
metric == "SBS (Q90)" ~ S_Y_q90
),
S_C = case_when(
metric == "SBS (Q10)" ~ S_C_q10,
metric == "SBS (Median)" ~ S_C_med,
metric == "SBS (Q90)" ~ S_C_q90
)
) |>
select(-S_Y_q10, -S_Y_med, -S_Y_q90, -S_C_q10, -S_C_med, -S_C_q90)ann_df = data.frame(
metric = unique(plot_data$metric)[1],
x = 0.01,
y = -0.07,
xend = 0.01,
yend = -0.175,
label = "more improper" # arrow annotation, drawn in the first facet only
)
# Scatter of the empirical SBS shift against epsilon = S_Y(t_max) * S_C(t_max),
# one panel per SBS metric, colored by sample size. A 20% subsample keeps the
# plot light; the arrow annotation marks the "more improper" direction.
set.seed(42)
shift_sample <- plot_data |>
  slice_sample(prop = 0.2) # 3 SBS metrics x 10 sample sizes (n) x 10000 simulations per n

ggplot(shift_sample, aes(x = epsilon, y = shift, color = factor(n))) +
  geom_point(size = 0.1, alpha = 0.5) +
  facet_wrap(~metric, scales = "fixed") +
  scale_color_discrete(name = "Sample size (n)") +
  scale_x_continuous(
    breaks = c(0, 0.025, 0.05, 0.075, 0.1),
    limits = c(0, 0.11)
  ) +
  labs(
    x = expression(S[Y](t[max]) ~ "\u00D7" ~ S[C](t[max])),
    y = expression(x^"*" ~ "-" ~ S[Y](tau^ ~ symbol("\052"))), # shift from optimal prediction
    color = "Sample size (n)"
  ) +
  theme_bw(base_size = 14, base_family = "Arial") +
  #theme(axis.text.x = element_text(angle = 35, hjust = 1)) +
  guides(color = guide_legend(override.aes = list(size = 4))) +
  geom_segment(
    data = ann_df,
    aes(x = x, y = y, xend = xend, yend = yend),
    inherit.aes = FALSE,
    linewidth = 1,
    arrow = arrow(length = unit(0.25, "cm"))
  ) +
  geom_text(
    data = ann_df,
    aes(x = x + 0.01, y = y - 0.05, label = label),
    inherit.aes = FALSE,
    angle = 90, size = 6, fontface = "bold"
  )
# Average epsilon per sample size n, shown as an interactive table
#ggsave(filename = "epsilon_vs_bias.png", dpi = 300, height = 5, width = 10, units = "in")plot_data |>
group_by(n) |>
summarise(
mean_epsilon = mean(epsilon)
) |>
DT::datatable(
rownames = FALSE,
options = list(searching = FALSE)
) |>
formatRound(columns = 2, digits = 4)set.seed(42)
# Shift vs the true survival probability S_Y(tau*) at the SBS evaluation time,
# faceted by metric and colored by sample size (seed is set just above; the
# 20% subsample keeps point counts manageable).
sy_sample <- plot_data |>
  slice_sample(prop = 0.2) # 3 SBS metrics x 10 sample sizes (n) x 10000 simulations per n

ggplot(sy_sample, aes(x = S_Y, y = shift, color = factor(n))) +
  geom_point(size = 0.3, alpha = 0.5) +
  facet_wrap(~metric, scales = "fixed") +
  scale_color_discrete(name = "Sample size (n)") +
  scale_x_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1), limits = c(0, 1)) +
  labs(
    x = expression(S[Y](tau^ ~ symbol("\052"))),
    y = expression(x^"*" ~ "-" ~ S[Y](tau^ ~ symbol("\052")))
  ) +
  theme_bw(base_size = 14) +
  theme(panel.spacing = unit(1, "lines")) +
  guides(color = guide_legend(override.aes = list(size = 4))) +
  geom_segment(
    data = ann_df,
    aes(x = x, y = y, xend = xend, yend = yend),
    inherit.aes = FALSE,
    linewidth = 1,
    arrow = arrow(length = unit(0.25, "cm"))
  ) +
  geom_text(
    data = ann_df,
    aes(x = x + 0.1, y = y - 0.05, label = label),
    inherit.aes = FALSE,
    angle = 90, size = 6, fontface = "bold"
  )
# Shift vs the censoring survival probability S_C(tau*) at the SBS evaluation
# time, faceted by metric and colored by sample size (20% subsample).
set.seed(42)
sc_sample <- plot_data |>
  slice_sample(prop = 0.2) # 3 SBS metrics x 10 sample sizes (n) x 10000 simulations per n

ggplot(sc_sample, aes(x = S_C, y = shift, color = factor(n))) +
  geom_point(size = 0.3, alpha = 0.5) +
  facet_wrap(~metric, scales = "fixed") +
  scale_color_discrete(name = "Sample size (n)") +
  scale_x_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1), limits = c(0, 1)) +
  labs(
    x = expression(S[C](tau^ ~ symbol("\052"))),
    y = expression(x^"*" ~ "-" ~ S[Y](tau^ ~ symbol("\052")))
  ) +
  theme_bw(base_size = 14) +
  theme(panel.spacing = unit(1, "lines")) +
  guides(color = guide_legend(override.aes = list(size = 4))) +
  geom_segment(
    data = ann_df,
    aes(x = x, y = y, xend = xend, yend = yend),
    inherit.aes = FALSE,
    linewidth = 1,
    arrow = arrow(length = unit(0.25, "cm"))
  ) +
  geom_text(
    data = ann_df,
    aes(x = x + 0.1, y = y - 0.05, label = label),
    inherit.aes = FALSE,
    angle = 90, size = 6, fontface = "bold"
  )
# Fix n = 100 (epsilon ~ 0.01) and compute the first-order approximation term
# (1 - S_Y)/S_C from the theoretical shift formula.
temp_data <- plot_data |>
  filter(n == 100) |> # epsilon ~ 0.01
  mutate(approx_term = (1 - S_Y)/S_C) |>
  # Bin S_Y and S_C into five equal-width probability bins.
  # BUG FIX: cut() defaults to right-closed intervals (a, b], which
  # contradicted the left-closed "[a, b)" labels below. right = FALSE makes
  # the intervals [a, b); include.lowest = TRUE then closes the LAST bin at
  # its upper end, giving [0.8, 1] exactly as labeled.
  mutate(
    S_Y_bin = cut(
      S_Y,
      #breaks = quantile(S_C, probs = seq(0, 1, 0.25), na.rm = TRUE),
      breaks = seq(0, 1, by = 0.2),
      right = FALSE,
      include.lowest = TRUE,
      labels = c("[0,0.2)", "[0.2,0.4)", "[0.4,0.6)", "[0.6,0.8)", "[0.8,1]")
    ),
    S_C_bin = cut(
      S_C,
      breaks = seq(0, 1, by = 0.2),
      right = FALSE,
      include.lowest = TRUE,
      labels = c("[0,0.2)", "[0.2,0.4)", "[0.4,0.6)", "[0.6,0.8)", "[0.8,1]")
    )
  )
# Annotation arrow ("more improper" direction), drawn in the first facet only
ann_df <- data.frame(
  metric = unique(temp_data$metric)[1],
  x = 0.03,
  y = -0.01,
  xend = 0.03,
  yend = -0.03,
  label = "more improper"
)
# Shift vs the theoretical approximation term (1 - S_Y)/S_C at n = 100,
# colored by binned S_Y(tau*) with a diverging blue-grey-red palette.
sy_bin_palette <- c("#2166AC", "#67A9CF", "#BDBDBD", "#EF8A62", "#B2182B")

ggplot(temp_data, aes(x = approx_term, y = shift, color = S_Y_bin)) +
  geom_point(size = 0.2, alpha = 0.4) +
  facet_wrap(~metric, scales = "fixed") +
  scale_color_manual(
    values = sy_bin_palette,
    name = expression(S[Y](tau^ ~ symbol("\052")))
  ) +
  labs(
    x = expression((1 - S[Y](tau^ ~ symbol("\052")))/S[C](tau^ ~ symbol("\052"))),
    y = expression(x^"*" ~ "-" ~ S[Y](tau^ ~ symbol("\052")))
  ) +
  theme_bw(base_size = 14) +
  theme(panel.spacing = unit(1, "lines")) +
  guides(color = guide_legend(override.aes = list(size = 8))) +
  geom_segment(
    data = ann_df,
    aes(x = x, y = y, xend = xend, yend = yend),
    inherit.aes = FALSE,
    linewidth = 1,
    arrow = arrow(length = unit(0.25, "cm"))
  ) +
  geom_text(
    data = ann_df,
    aes(x = x + 0.2, y = y - 0.01, label = label),
    inherit.aes = FALSE,
    angle = 90, size = 6, fontface = "bold"
  )
# Shift vs the approximation term (1 - S_Y)/S_C at n = 100, this time colored
# by binned S_C(tau*) with the same diverging blue-grey-red palette.
sc_bin_palette <- c("#2166AC", "#67A9CF", "#BDBDBD", "#EF8A62", "#B2182B")

ggplot(temp_data, aes(x = approx_term, y = shift, color = S_C_bin)) +
  geom_point(size = 0.2, alpha = 0.4) +
  facet_wrap(~metric, scales = "fixed") +
  scale_color_manual(
    values = sc_bin_palette,
    name = expression(S[C](tau^ ~ symbol("\052")))
  ) +
  labs(
    x = expression((1 - S[Y](tau^ ~ symbol("\052")))/S[C](tau^ ~ symbol("\052"))),
    y = expression(x^"*" ~ "-" ~ S[Y](tau^ ~ symbol("\052")))
  ) +
  theme_bw(base_size = 14) +
  theme(panel.spacing = unit(1, "lines")) +
  guides(color = guide_legend(override.aes = list(size = 8))) +
  geom_segment(
    data = ann_df,
    aes(x = x, y = y, xend = xend, yend = yend),
    inherit.aes = FALSE,
    linewidth = 1,
    arrow = arrow(length = unit(0.25, "cm"))
  ) +
  geom_text(
    data = ann_df,
    aes(x = x + 0.2, y = y - 0.01, label = label),
    inherit.aes = FALSE,
    angle = 90, size = 6, fontface = "bold"
  )
# Restrict to the smallest sample size (n = 50) for the shift-surface plot
base_data <- dplyr::filter(plot_data, n == 50)
# Interpolate the shift values onto a regular nx-by-ny grid over (S_C, S_Y)
# using bivariate interpolation; duplicate (S_C, S_Y) points are averaged.
# Returns a long-format data frame (S_C, S_Y, shift, metric) — callers pass
# one metric's rows via group_split, so unique(df$metric) is a single value —
# or NULL for empty input.
interp_shift <- function(df, nx = 150, ny = 150) {
  if (nrow(df) == 0) {
    return(NULL)
  }
  grid <- akima::interp(
    x = df$S_C, y = df$S_Y, z = df$shift,
    nx = nx, ny = ny, duplicate = "mean"
  )
  # expand.grid varies S_C fastest, matching the column-major order of grid$z
  out <- expand.grid(S_C = grid$x, S_Y = grid$y)
  out$shift <- as.vector(grid$z)
  out$metric <- unique(df$metric)
  out
}
# Interpolate the shift surface per metric and stack into one data frame
interp_list = base_data |>
dplyr::group_split(metric, .keep = TRUE) |>
purrr::map(.f = interp_shift)
surface_data = bind_rows(interp_list) |>
dplyr::filter(!is.na(shift))
# Shared fill scale across facets, anchored at the most negative shift and
# extended up to 0 (see limits below)
range_shift = range(surface_data$shift, na.rm = TRUE)
min_shift = range_shift[1] # most negative
max_shift = range_shift[2] # closest to zero
# Heatmap of the interpolated shift over (S_C, S_Y), one facet per SBS metric
ggplot(surface_data, aes(x = S_C, y = S_Y, fill = shift)) +
geom_tile() +
facet_wrap(~ metric) +
scale_fill_gradientn(
colours = c("#B2182B", "grey80", "#2166AC"),
values = scales::rescale(c(min_shift, (min_shift + max_shift)/2, max_shift)),
limits = c(min_shift, 0), # Extend upper limit to 0
name = expression(x^"*" ~ "-" ~ S[Y](tau^ ~ symbol("\052")))
) +
labs(
x = expression(S[C](tau^ ~ symbol("\052"))),
y = expression(S[Y](tau^ ~ symbol("\052")))
) +
theme_bw(base_size = 14) +
theme(panel.spacing = unit(1, "lines"))
#ggsave(filename = "bias_surface_n50.png", dpi = 300, height = 5, width = 10, units = "in")base_data2 = plot_data |>
dplyr::filter(n == 100)
# Same interpolation + heatmap as above, now at n = 100
interp_list2 = base_data2 |>
dplyr::group_split(metric, .keep = TRUE) |>
purrr::map(.f = interp_shift)
surface_data2 = bind_rows(interp_list2) |>
dplyr::filter(!is.na(shift))
range_shift2 = range(surface_data2$shift, na.rm = TRUE)
min_shift = range_shift2[1] # most negative
max_shift = range_shift2[2] # closest to zero
ggplot(surface_data2, aes(x = S_C, y = S_Y, fill = shift)) +
geom_tile() +
facet_wrap(~ metric) +
scale_fill_gradientn(
colours = c("#B2182B", "grey80", "#2166AC"),
values = scales::rescale(c(min_shift, (min_shift + max_shift)/2, max_shift)),
limits = c(min_shift, 0), # Extend upper limit to 0
name = expression(x^"*" ~ "-" ~ S[Y](tau^ ~ symbol("\052")))
) +
labs(
x = expression(S[C](tau^ ~ symbol("\052"))),
y = expression(S[Y](tau^ ~ symbol("\052")))
) +
theme_bw(base_size = 14) +
theme(panel.spacing = unit(1, "lines"))
#ggsave(filename = "sbs_shift_surface_n100.png", dpi = 300, height = 5, width = 10, units = "in")The only difference with the previous experiment is that S_C(t) is now being estimated using the marginal Kaplan-Meier model via survival::survfit() instead of using the true Weibull censoring distribution (see helper.R). For SBS/ISBS we use constant interpolation of the censoring survival distribution S_C(t). For RCLL* we use linear interpolation of S_C(t) to mitigate density estimation issues, i.e. for f_C(t).
Use properness_test.R to run the experiments. To run for different sample sizes n, use run_tests.sh, changing estimate_cens to TRUE. Merged simulation results for all n are available here.
Load results (same output columns as in the previous section):
# Load the variant where S_C(t) is estimated via Kaplan-Meier (file suffix _1,
# produced with estimate_cens = TRUE); drop the same bookkeeping columns
res = readRDS("results/res_sims10000_distrs1000_1.rds") |>
select(!matches("shape|scale|prop_cens|tv_dist|n_events|max_t|m_true|m_pred")) # remove columnsAs before, we compute 95% confidence intervals for the score differences across m = 1000 draws per simulation:
# Scoring rules to analyze ("wRCLL" is relabeled "RCLL*" below)
measures = c("SBS_q10", "SBS_median", "SBS_q90", "ISBS", "RCLL", "wRCLL")
n_distrs = 1000 # `m` value from paper's experiment
tcrit = qt(0.975, df = n_distrs - 1) # 95% t-test
threshold = 1e-4
# Same long-table construction as in the true-censoring section: per scoring
# rule, compute the t-based 95% CI of the mean score difference D and flag
# significant violations (mean above threshold AND CI excluding zero).
data = measures |>
lapply(function(m) {
# Column names holding the per-simulation mean and SD of D for this rule
mean_diff_col = paste0(m, "_diff")
mean_sd_col = paste0(m, "_sd")
res |>
select(sim, n, !!mean_diff_col, !!mean_sd_col, starts_with("shift_"),
epsilon, starts_with("S_")) |>
select(-ends_with("tail")) |>
mutate(
# Standard error of the mean over the m = 1000 draws
se = .data[[mean_sd_col]] / sqrt(n_distrs),
CI_lower = .data[[mean_diff_col]] - tcrit * se,
CI_upper = .data[[mean_diff_col]] + tcrit * se,
signif_violation = .data[[mean_diff_col]] > threshold & CI_lower > 0,
# Keep relevant SBS columns, just in case
shift_q10 = if (m == "SBS_q10") shift_q10 else NA_real_,
S_Y_q10 = if (m == "SBS_q10") S_Y_q10 else NA_real_,
S_C_q10 = if (m == "SBS_q10") S_C_q10 else NA_real_,
shift_med = if (m == "SBS_median") shift_med else NA_real_,
S_Y_med = if (m == "SBS_median") S_Y_med else NA_real_,
S_C_med = if (m == "SBS_median") S_C_med else NA_real_,
shift_q90 = if (m == "SBS_q90") shift_q90 else NA_real_,
S_Y_q90 = if (m == "SBS_q90") S_Y_q90 else NA_real_,
S_C_q90 = if (m == "SBS_q90") S_C_q90 else NA_real_
) |>
select(!se) |>
mutate(metric = !!m) |>
relocate(metric, .after = 1) |>
rename(diff = !!mean_diff_col,
sd = !!mean_sd_col)
}) |>
bind_rows()
# Order and relabel metrics for tables/plots (wRCLL is displayed as RCLL*)
data$metric = factor(
data$metric,
levels = measures,
labels = c("SBS (Q10)", "SBS (Median)", "SBS (Q90)", "ISBS", "RCLL", "RCLL*")
)Lastly, we summarize the significant violations across sample sizes and scoring rules:
# Per (n, metric): count and rate of significant violations, plus summary
# statistics of the mean score difference D among violating simulations only
all_stats =
data |>
group_by(n, metric) |>
summarize(
n_violations = sum(signif_violation),
violation_rate = mean(signif_violation),
# Summaries are NA when a (n, metric) cell has no violations
diff_mean = if (any(signif_violation)) mean(diff[signif_violation]) else NA_real_,
diff_median = if (any(signif_violation)) median(diff[signif_violation]) else NA_real_,
diff_min = if (any(signif_violation)) min(diff[signif_violation]) else NA_real_,
diff_max = if (any(signif_violation)) max(diff[signif_violation]) else NA_real_,
.groups = "drop"
)
# Render as Table D1 (Kaplan-Meier-estimated censoring distribution G(t))
all_stats |>
arrange(metric) |>
DT::datatable(
rownames = FALSE,
options = list(searching = FALSE),
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: center; font-size:150%",
"Table D1: Empirical violations of properness using G(t)")
) |>
DT::formatRound(columns = 4:8, digits = 5)Estimating S_C(t) via Kaplan-Meier produced nearly identical results to using the true distribution, with no violations for RCLL/RCLL* and only small-sample effects for SBS and ISBS.
To run the experiment use real_data_benchmark.R.
We assess the performance using three survival scoring rules (RCLL, RCLL*, ISBS), D-calibration and Harrell’s C-index on real-world survival datasets using the mlr3proba framework, tested with 10 repetitions of 5-fold cross-validation on the following learners:
Below we show the aggregated performance metrics across all tasks and learners:
# Real-data benchmark results (10 repetitions of 5-fold CV; produced by
# real_data_benchmark.R)
res = readRDS("results/real_data_bm.rds")
# Rename mlr3 learner ids to short display names, then render a searchable
# table; column 6 (D-calibration) uses significant digits instead of rounding
res |>
dplyr::mutate(
learner_id = dplyr::recode(
learner_id,
"surv.kaplan" = "KM",
"surv.ranger" = "RSF",
"surv.coxph" = "CoxPH"
)
)|>
DT::datatable(rownames = FALSE, filter = "top", options = list(searching = TRUE)) |>
DT::formatRound(columns = 3:7, digits = 3) |>
DT::formatSignif(columns = 6, digits = 3) # scientific notation for surv.dcalibOur experiment demonstrates substantial divergences in model assessment depending on the chosen metric (especially in the case of the scoring rules).
For instance, on datasets such as veteran and whas, which exhibit non-proportional hazards, RSF is more suitable to model non-linear effects compared to the CoxPH. In these cases, RCLL* identifies RSF as superior, whereas RCLL and ISBS still prefer CoxPH — suggesting potential practical limitations of the latter metrics in such settings.
Additionally, RCLL* appears to better capture both discrimination and calibration. In the gbcs dataset, CoxPH achieves the best RCLL, ISBS and C-index scores, yet shows extremely poor calibration (\text{D-calib} \approx 15.7), while RCLL* ranks Kaplan-Meier as the best — possibly reflecting better-calibrated risk estimates. This discrepancy becomes even more pronounced in the lung dataset, where CoxPH again yields high RCLL/C-index/ISBS performance despite severe calibration failure (\text{D-calib} \approx 23.5), whereas RCLL* selects RSF instead.
These findings highlight a critical need for further research: not all metrics labeled as “proper” in theory behave equally in practice.
utils::sessionInfo()R version 4.4.2 (2024-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.3 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_DK.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_DK.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_DK.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Oslo
tzcode source: system (glibc)
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] future_1.69.0 mlr3tuning_1.5.0
[3] paradox_1.0.1 mlr3extralearners_1.4.0.9000
[5] R6_2.6.1 circlize_0.4.16
[7] ComplexHeatmap_2.21.1 ggpubr_0.6.2
[9] mlr3proba_0.8.6 mlr3_1.3.0
[11] lubridate_1.9.3 forcats_1.0.0
[13] stringr_1.5.1 readr_2.1.5
[15] tidyverse_2.0.0 DT_0.33
[17] tidyr_1.3.1 purrr_1.0.2
[19] akima_0.6-3.6 patchwork_1.3.2
[21] ggplot2_4.0.1 tibble_3.3.0
[23] dplyr_1.1.4
loaded via a namespace (and not attached):
[1] rlang_1.1.7 magrittr_2.0.3 clue_0.3-66
[4] GetoptLong_1.0.5 matrixStats_1.4.1 compiler_4.4.2
[7] callr_3.7.6 png_0.1-8 vctrs_0.6.5
[10] reshape2_1.4.4 shape_1.4.6.1 pkgconfig_2.0.3
[13] crayon_1.5.3 fastmap_1.2.0 backports_1.5.0
[16] labeling_0.4.3 rmarkdown_2.29 tzdb_0.4.0
[19] ps_1.9.1 bit_4.5.0 torch_0.16.3
[22] xfun_0.49 cachem_1.1.0 mlr3misc_0.19.0
[25] jsonlite_2.0.0 survdistr_0.0.1 uuid_1.2-1
[28] cluster_2.1.6 broom_1.0.7 parallel_4.4.2
[31] bslib_0.8.0 stringi_1.8.4 RColorBrewer_1.1-3
[34] parallelly_1.46.1 car_3.1-3 jquerylib_0.1.4
[37] Rcpp_1.1.1 iterators_1.0.14 knitr_1.49
[40] IRanges_2.40.1 Matrix_1.7-1 splines_4.4.2
[43] timechange_0.3.0 tidyselect_1.2.1 abind_1.4-8
[46] yaml_2.3.10 doParallel_1.0.17 codetools_0.2-20
[49] processx_3.8.6 param6_0.2.4 listenv_0.10.0
[52] lattice_0.22-6 plyr_1.8.9 withr_3.0.2
[55] S7_0.2.1 evaluate_1.0.5 survival_3.7-0
[58] pillar_1.11.0 carData_3.0-5 checkmate_2.3.3
[61] foreach_1.5.2 stats4_4.4.2 generics_0.1.3
[64] bbotk_1.8.0 sp_2.1-4 hms_1.1.3
[67] S4Vectors_0.44.0 scales_1.4.0 coro_1.1.0
[70] dictionar6_0.1.3 globals_0.18.0 distr6_1.8.4
[73] glue_1.8.0 tools_4.4.2 mlr3pipelines_0.10.0
[76] data.table_1.18.2.1 ggsignif_0.6.4 set6_0.2.6
[79] crosstalk_1.2.1 colorspace_2.1-1 palmerpenguins_0.1.1
[82] Formula_1.2-5 cli_3.6.5 viridisLite_0.4.2
[85] ooplah_0.2.0 gtable_0.3.6 rstatix_0.7.2
[88] sass_0.4.9 digest_0.6.39 BiocGenerics_0.52.0
[91] mlr3cmprsk_0.0.1 lgr_0.5.0 rjson_0.2.23
[94] htmlwidgets_1.6.4 farver_2.1.2 htmltools_0.5.8.1
[97] lifecycle_1.0.4 GlobalOptions_0.1.2 bit64_4.5.2