Skip to contents

Run this setup first. It loads the packages used in the article and sets figure options used by the screening examples.

library(windcut)
library(dplyr)
library(ggplot2)
library(tidyr)
library(cowplot)

From windows to candidate predictors

Window-pane analysis can create many weather predictors. The next task is to decide which ones deserve attention before modeling. This article uses two screening functions:

  • screen_window_features() ranks each candidate predictor by its correlation with the disease response.
  • screen_feature_correlations() compares candidate predictors against each other and suggests a less redundant subset.

Important. Screening is not model validation. It is only an first pass for reducing a large feature space before fitting predictive models.

Build a feature table

Before screening, we need one row per site and one column per candidate predictor. The bundled window_pane_demo_data object already contains daily weather summaries and one disease assessment per site. Here the windows end one day before the assessment date, so the disease observation is not mixed with same-day weather.

The daily weather column names are kept exactly as they appear in the data: daily_mean_temp, daily_mean_rh, daily_sum_rain, and daily_sum_leaf_wetness. We also derive daily_vpd from temperature and relative humidity to show that derived variables can be screened in the same workflow. The statistics list uses those column names, so the generated feature names show both the weather variable and the window summary.

data(window_pane_demo_data)

weather <- window_pane_demo_data$weather |>
  derive_vpd(daily_mean_temp, daily_mean_rh, name = "daily_vpd")
assessments <- window_pane_demo_data$assessments

windows <- make_windows(min_offset = -18, max_offset = -1, width = 5, reference_col = "assessment_time")

daily_weather_cols <- c(
  "daily_mean_temp",
  "daily_mean_rh",
  "daily_vpd",
  "daily_sum_rain",
  "daily_sum_leaf_wetness"
)

summary_statistics <- list(
  daily_mean_temp = list(mean = "mean", days_18_26 = count_between(18, 26)),
  daily_mean_rh = list(mean = "mean", humid_days = humid_hours(90)),
  daily_vpd = list(mean = "mean", dry_days = count_above(1.2)),
  daily_sum_rain = list(total = "sum", rain_events = rain_events(0.2)),
  daily_sum_leaf_wetness = list(wet_days = wet_days(0)),
  .conditions = list(
    favorable_days = count_when(
      daily_mean_temp >= 18 & daily_mean_temp <= 26 & daily_mean_rh >= 90
    )
  )
)

features <- window_pane(
  weather = weather,
  assessments = assessments,
  windows = windows,
  id_col = "site_id",
  response_col = "disease_intensity",
  weather_cols = daily_weather_cols,
  statistics = summary_statistics
)

feature_cols <- features %>%
  select(contains("_window_")) %>%
  names()

feature_table_overview <- data.frame(
  rows = nrow(features),
  candidate_feature_columns = length(feature_cols)
)

knitr::kable(feature_table_overview)
rows candidate_feature_columns
10 143
knitr::kable(data.frame(feature = head(feature_cols, 8)))
feature
n_obs_window_m18_m13
daily_mean_temp_mean_window_m18_m13
daily_mean_temp_days_18_26_window_m18_m13
daily_mean_rh_mean_window_m18_m13
daily_mean_rh_humid_days_window_m18_m13
daily_vpd_mean_window_m18_m13
daily_vpd_dry_days_window_m18_m13
daily_sum_rain_total_window_m18_m13

Screen with Spearman correlation

The first question is: which individual weather-window predictors are most associated with disease intensity? screen_window_features() answers that with one correlation test per candidate predictor. Spearman correlation is useful for this example because it ranks monotonic relationships and is less dependent on a linear response shape than Pearson correlation.

screened <- screen_window_features(
  data = features,
  response_col = "disease_intensity",
  feature_cols = feature_cols,
  method = "spearman"
)

knitr::kable(screened %>% slice_head(n = 10))
feature metric window estimate p_value n_complete p_adjusted
daily_mean_rh_mean_window_m10_m05 daily_mean_rh_mean window_m10_m05 -0.6242424 0.0602490 10 0.7946892
daily_sum_rain_total_window_m08_m03 daily_sum_rain_total window_m08_m03 -0.6000000 0.0731199 10 0.7946892
daily_mean_rh_mean_window_m12_m07 daily_mean_rh_mean window_m12_m07 -0.5878788 0.0802151 10 0.7946892
daily_sum_rain_rain_events_window_m09_m04 daily_sum_rain_rain_events window_m09_m04 0.5698029 0.0855044 10 0.7946892
daily_vpd_mean_window_m10_m05 daily_vpd_mean window_m10_m05 0.5636364 0.0957916 10 0.7946892
daily_mean_rh_mean_window_m08_m03 daily_mean_rh_mean window_m08_m03 -0.5151515 0.1328231 10 0.7946892
daily_mean_rh_mean_window_m07_m02 daily_mean_rh_mean window_m07_m02 -0.4909091 0.1544427 10 0.7946892
daily_mean_rh_mean_window_m11_m06 daily_mean_rh_mean window_m11_m06 -0.4787879 0.1660580 10 0.7946892
daily_mean_rh_mean_window_m09_m04 daily_mean_rh_mean window_m09_m04 -0.4787879 0.1660580 10 0.7946892
daily_vpd_mean_window_m08_m03 daily_vpd_mean window_m08_m03 0.4666667 0.1782193 10 0.7946892

Plotting the strongest screened features makes the sign and magnitude of the associations easier to scan. Positive and negative correlations are separated by color because they suggest different biological interpretations: higher values of a predictor may be associated with either higher or lower disease intensity.

top10 <- screened %>%
  mutate(abs_estimate = abs(estimate)) %>%
  arrange(desc(abs_estimate)) %>%
  slice_head(n = 10) %>%
  mutate(feature = factor(feature, levels = rev(feature)))

ggplot(top10, aes(estimate, feature, fill = estimate > 0)) +
  geom_col(show.legend = FALSE) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
  scale_fill_manual(values = c("#c47f2c", "#5f8f63")) +
  labs(
    title = "Top feature-screening results",
    x = "Correlation estimate",
    y = NULL
  ) +
  cowplot::theme_half_open()

ggplot2 chart showing the result of this windcut workflow step.

The output provides:

  • estimate: correlation estimate
  • p_value: raw test p-value
  • p_adjusted: corrected p-value
  • metric: parsed metric name
  • window: parsed relative-time window label

The adjusted p-value is useful because many window-derived predictors are being tested at once. It should not be read as final proof that a predictor belongs in a model, but it helps keep the first screen from being driven only by chance.

Reduce correlated candidate features

Many modeling approaches become unstable or difficult to interpret when the predictor set contains strongly correlated variables. Use screen_feature_correlations() to compare candidate features against each other and suggest a less redundant subset.

Here the correlation threshold is set to 0.8, meaning pairs with absolute correlation of at least 0.8 are treated as strongly redundant. The method is user-controlled, so Pearson, Spearman, or Kendall can be chosen for the problem. When a highly correlated pair is found, the function keeps the feature that is less correlated with the rest of the candidate set.

correlation_screen <- screen_feature_correlations(
  data = features,
  feature_cols = feature_cols,
  method = "spearman",
  threshold = 0.8
)

knitr::kable(correlation_screen$high_correlations %>% slice_head(n = 10))
feature_a feature_b correlation abs_correlation
daily_sum_rain_rain_events_window_m12_m07 daily_sum_rain_rain_events_window_m11_m06 1.0000000 1.0000000
daily_sum_rain_rain_events_window_m07_m02 daily_sum_rain_rain_events_window_m06_m01 1.0000000 1.0000000
daily_mean_temp_mean_window_m18_m13 daily_vpd_mean_window_m18_m13 0.9878788 0.9878788
daily_mean_temp_mean_window_m17_m12 daily_vpd_mean_window_m17_m12 0.9878788 0.9878788
daily_mean_temp_mean_window_m16_m11 daily_vpd_mean_window_m16_m11 0.9878788 0.9878788
daily_mean_temp_mean_window_m13_m08 daily_vpd_mean_window_m06_m01 -0.9878788 0.9878788
daily_vpd_mean_window_m13_m08 daily_vpd_mean_window_m06_m01 -0.9878788 0.9878788
daily_mean_temp_mean_window_m06_m01 daily_vpd_mean_window_m06_m01 0.9878788 0.9878788
daily_mean_temp_mean_window_m15_m10 daily_vpd_mean_window_m15_m10 0.9757576 0.9757576
daily_mean_temp_mean_window_m13_m08 daily_vpd_mean_window_m13_m08 0.9757576 0.9757576
knitr::kable(
  data.frame(suggested_feature = correlation_screen$suggested_features) %>%
    slice_head(n = 12)
)
suggested_feature
daily_mean_rh_mean_window_m18_m13
daily_sum_rain_rain_events_window_m18_m13
daily_mean_temp_mean_window_m17_m12
daily_mean_rh_mean_window_m17_m12
daily_sum_rain_total_window_m17_m12
daily_sum_rain_rain_events_window_m17_m12
daily_sum_rain_total_window_m15_m10
daily_sum_rain_rain_events_window_m15_m10
daily_mean_temp_mean_window_m14_m09
daily_sum_rain_rain_events_window_m14_m09
daily_mean_rh_mean_window_m13_m08
daily_sum_rain_rain_events_window_m13_m08

The screening object also makes it easy to quantify how much redundancy was removed. The table and plot below compare the original number of candidate features with the number suggested for modeling and the number set aside under the selected correlation threshold.

feature_reduction <- data.frame(
  status = c("Original features", "Suggested features", "Removed features"),
  count = c(
    length(feature_cols),
    length(correlation_screen$suggested_features),
    length(correlation_screen$removed_features)
  )
)

knitr::kable(feature_reduction)
status count
Original features 143
Suggested features 22
Removed features 121

ggplot(feature_reduction, aes(count, status, fill = status)) +
  geom_col(show.legend = FALSE, width = 0.65) +
  geom_text(aes(label = count), hjust = -0.25, size = 3.8) +
  scale_fill_manual(values = c(
    "Original features" = "#3f7d58",
    "Suggested features" = "#6ea87d",
    "Removed features" = "#c47f2c"
  )) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.12))) +
  labs(
    title = "Feature count after correlation screening",
    x = "Number of feature columns",
    y = NULL
  ) +
  cowplot::theme_half_open()

ggplot2 chart showing the result of this windcut workflow step.

The first plot explains the suggestion mechanism. We plot the most redundant variables first; features with high mean absolute correlation to the rest of the feature set are more likely to be set aside. Green bars are suggested for the next modeling step; orange bars are redundant under the selected threshold.

summary_plot <- correlation_screen$feature_summary %>%
  arrange(desc(mean_abs_correlation)) %>%
  slice_head(n = 30) %>%
  mutate(feature = factor(feature, levels = rev(feature)))

ggplot(summary_plot, aes(mean_abs_correlation, feature, fill = suggested)) +
  geom_col() +
  scale_fill_manual(values = c("#c47f2c", "#5f8f63")) +
  labs(
    title = "Feature redundancy ranking",
    x = "Mean absolute correlation with other features",
    y = NULL,
    fill = "Suggested"
  ) +
  cowplot::theme_half_open()

ggplot2 chart showing the result of this windcut workflow step.

The heatmap complements the ranking plot by showing the pairwise structure. Values close to 1 or -1 indicate features carrying very similar information, while values close to 0 indicate features that are less redundant. To keep labels readable, this plot uses a focused subset of the suggested features.

heatmap_features <- head(correlation_screen$suggested_features, 12)
heatmap_matrix <- correlation_screen$correlation_matrix[
  heatmap_features,
  heatmap_features,
  drop = FALSE
]

heatmap_data <- as.data.frame(as.table(heatmap_matrix)) %>%
  rename(feature_x = Var1, feature_y = Var2, correlation = Freq) %>%
  mutate(
    label = sprintf("%.2f", correlation),
    feature_x = factor(feature_x, levels = heatmap_features),
    feature_y = factor(feature_y, levels = rev(heatmap_features))
  )

ggplot(heatmap_data, aes(feature_x, feature_y, fill = correlation)) +
  geom_tile(color = "white", linewidth = 0.4) +
  geom_text(aes(label = label), size = 2.6) +
  scale_fill_gradient2(
    low = "#4b79a8",
    mid = "white",
    high = "#b13a32",
    midpoint = 0,
    limits = c(-1, 1),
    name = "Correlation"
  ) +
  labs(
    title = "Correlation matrix for suggested features",
    x = NULL,
    y = NULL
  ) +
  coord_fixed() +
  cowplot::theme_half_open() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    panel.grid = element_blank()
  )

ggplot2 chart showing the result of this windcut workflow step.

A practical interpretation workflow

  1. Look for metrics that show repeated signal across nearby windows.
  2. Prefer windows that make biological sense, not only statistical sense.
  3. Remove or set aside highly correlated predictors when the modeling approach assumes low feature redundancy.
  4. Avoid overinterpreting isolated peaks when neighboring windows are inconsistent.
  5. Carry a small candidate set into the predictive modeling stage.