Daily Weather Aggregation • windcut

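Run this setup first. It loads windcut together with dplyr, ggplot2, tidyr, and cowplot. The table previews below call knitr::kable(), so knitr must also be installed, although it does not need to be attached.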

library(windcut)
library(dplyr)
library(ggplot2)
library(tidyr)
library(cowplot)

Why aggregate to daily data?

Window-pane analysis can be done with hourly or daily data. For many plant disease workflows, daily data are easier to inspect and easier to explain: temperature can be represented by daily mean, minimum, or maximum; rainfall by daily total; and leaf wetness by daily wet hours or daily proportion.

The important point is that the aggregation is an analysis decision. windcut does not need to hide it. Use aggregate_weather_daily() when your source data are hourly or sub-daily and you want to define the daily values before creating candidate windows.

Start from hourly weather

The bundled window-pane demo contains both daily weather and the hourly source weather. The main workflow tutorials use the daily table, but this tutorial starts from weather_hourly to show how that table can be created.

# Load the bundled demo data and pull out the hourly source table.
data(window_pane_demo_data)

hourly_weather <- window_pane_demo_data[["weather_hourly"]]

# Preview the first eight hourly records of the first site.
hourly_weather %>%
  filter(site_id == first(site_id)) %>%
  slice_head(n = 8) %>%
  knitr::kable()

site_id	time	temp	rh	rain	leaf_wetness
S01	2023-12-01 00:00:00	20.33	87.94	0.00	0
S01	2023-12-01 01:00:00	23.99	89.97	0.00	0
S01	2023-12-01 02:00:00	26.29	91.84	0.00	1
S01	2023-12-01 03:00:00	25.81	92.69	2.75	1
S01	2023-12-01 04:00:00	26.30	80.51	0.00	0
S01	2023-12-01 05:00:00	27.82	80.61	0.00	0
S01	2023-12-01 06:00:00	28.50	80.42	0.00	0
S01	2023-12-01 07:00:00	27.57	72.62	0.00	0

Use the default daily summaries

The default daily aggregation keeps the usual windcut weather-column names. Temperature and relative humidity are averaged, rainfall is summed, and leaf-wetness observations are summed. With hourly source data, the daily leaf-wetness sum can be interpreted as wet hours. Output column names combine the time scale, statistic, and variable, such as daily_mean_temp and daily_sum_rain; custom statistic labels and .conditions entries, both shown later, replace parts of that pattern.

# Aggregate with the default statistics; only the site identifier is named.
daily_weather <- aggregate_weather_daily(
  id_col = "site_id",
  weather = hourly_weather
)

# Preview the first eight daily rows of the first site.
daily_weather %>%
  filter(site_id == first(site_id)) %>%
  slice_head(n = 8) %>%
  knitr::kable()

site_id	date	time	daily_mean_temp	daily_mean_rh	daily_sum_rain	daily_sum_leaf_wetness
S01	2023-12-01	2023-12-01	22.33292	80.61750	7.15	6
S01	2023-12-02	2023-12-02	22.67500	78.64167	0.85	4
S01	2023-12-03	2023-12-03	23.35333	79.42042	3.61	7
S01	2023-12-04	2023-12-04	23.39167	77.86875	0.00	3
S01	2023-12-05	2023-12-05	23.13667	78.99000	6.59	6
S01	2023-12-06	2023-12-06	22.63375	80.20750	0.86	6
S01	2023-12-07	2023-12-07	22.24750	79.78875	2.11	7
S01	2023-12-08	2023-12-08	21.59917	80.14833	0.00	4

The result has one row per site per day. The time column is placed at midnight so the daily table can be used directly with make_windows() and window_pane(unit = "days").

# Count the daily rows per site to confirm that every site covers the same
# number of days.
daily_weather %>%
  count(site_id, name = "n_days") %>%
  knitr::kable()

site_id	n_days
S01	180
S02	180
S03	180
S04	180
S05	180
S06	180
S07	180
S08	180
S09	180
S10	180

Choose the time and date columns

Many datasets have a timestamp column such as time, datetime, or timestamp. If the data do not already have a date column, aggregate_weather_daily() derives the day from time_col and writes it to date_col.

# Mimic a dataset whose timestamp column is not called `time`.
timestamp_weather <- rename(hourly_weather, timestamp = time)

# Point time_col at the renamed column and name the derived day column.
daily_from_timestamp <- aggregate_weather_daily(
  weather = timestamp_weather,
  id_col = "site_id",
  time_col = "timestamp",
  date_col = "weather_date"
)

# Preview the key columns for the first site.
daily_from_timestamp %>%
  filter(site_id == first(site_id)) %>%
  select(
    site_id, weather_date, timestamp,
    daily_mean_temp, daily_mean_rh, daily_sum_rain
  ) %>%
  slice_head(n = 6) %>%
  knitr::kable()

site_id	weather_date	timestamp	daily_mean_temp	daily_mean_rh	daily_sum_rain
S01	2023-12-01	2023-12-01	22.33292	80.61750	7.15
S01	2023-12-02	2023-12-02	22.67500	78.64167	0.85
S01	2023-12-03	2023-12-03	23.35333	79.42042	3.61
S01	2023-12-04	2023-12-04	23.39167	77.86875	0.00
S01	2023-12-05	2023-12-05	23.13667	78.99000	6.59
S01	2023-12-06	2023-12-06	22.63375	80.20750	0.86

Some field datasets already have both a timestamp column and a day column. In that case, set date_col to the existing day column. The function will use that column for grouping and still create a daily timestamp column if keep_time = TRUE.

# Add a day column next to the hourly timestamp, as some field datasets have.
weather_with_day <- mutate(hourly_weather, weather_day = as.Date(time))

# Reuse the existing day column by passing its name as date_col.
daily_from_existing_day <- aggregate_weather_daily(
  weather = weather_with_day,
  id_col = "site_id",
  time_col = "time",
  date_col = "weather_day"
)

# Preview the key columns for the first site.
daily_from_existing_day %>%
  filter(site_id == first(site_id)) %>%
  select(
    site_id, weather_day, time,
    daily_mean_temp, daily_mean_rh, daily_sum_rain
  ) %>%
  slice_head(n = 6) %>%
  knitr::kable()

site_id	weather_day	time	daily_mean_temp	daily_mean_rh	daily_sum_rain
S01	2023-12-01	2023-12-01	22.33292	80.61750	7.15
S01	2023-12-02	2023-12-02	22.67500	78.64167	0.85
S01	2023-12-03	2023-12-03	23.35333	79.42042	3.61
S01	2023-12-04	2023-12-04	23.39167	77.86875	0.00
S01	2023-12-05	2023-12-05	23.13667	78.99000	6.59
S01	2023-12-06	2023-12-06	22.63375	80.20750	0.86

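If the input is already daily and only has a date column, use time_col = NULL and keep_time = FALSE. This is useful when you want to redefine daily statistics from repeated daily records or harmonize an already daily table without creating a timestamp column. The example below mimics such a table by reducing each hourly timestamp to its date, which leaves several records per site and day.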

# Build a table that has a day column but no timestamp column.
already_dated <- hourly_weather %>%
  mutate(weather_day = as.Date(time)) %>%
  select(all_of(c("site_id", "weather_day", "temp", "rain")))

# No time column is read (time_col = NULL) and none is written
# (keep_time = FALSE); only temp and rain are summarised.
daily_from_date_only <- aggregate_weather_daily(
  weather = already_dated,
  id_col = "site_id",
  date_col = "weather_day",
  time_col = NULL,
  keep_time = FALSE,
  weather_cols = c("temp", "rain"),
  statistics = list(temp = "mean", rain = "sum")
)

# Preview the first six daily rows of the first site.
daily_from_date_only %>%
  filter(site_id == first(site_id)) %>%
  slice_head(n = 6) %>%
  knitr::kable()

site_id	weather_day	daily_mean_temp	daily_sum_rain
S01	2023-12-01	22.33292	7.15
S01	2023-12-02	22.67500	0.85
S01	2023-12-03	23.35333	3.61
S01	2023-12-04	23.39167	0.00
S01	2023-12-05	23.13667	6.59
S01	2023-12-06	22.63375	0.86

Choose different daily statistics

The statistics argument uses the same idea as the window functions. A character vector applies the same statistics to every selected weather variable. This is compact when the same summaries make sense for all variables.

# A character vector of statistics is passed once for all weather variables.
daily_many_stats <- aggregate_weather_daily(
  weather = hourly_weather,
  id_col = "site_id",
  statistics = c("mean", "min", "max")
)

# Preview the temperature and relative-humidity summaries for the first site.
daily_many_stats %>%
  filter(site_id == first(site_id)) %>%
  select(
    site_id, date, time,
    daily_mean_temp, daily_min_temp, daily_max_temp,
    daily_mean_rh, daily_min_rh, daily_max_rh
  ) %>%
  slice_head(n = 6) %>%
  knitr::kable()

site_id	date	time	daily_mean_temp	daily_min_temp	daily_max_temp	daily_mean_rh	daily_min_rh	daily_max_rh
S01	2023-12-01	2023-12-01	22.33292	16.32	28.50	80.61750	60.69	93.53
S01	2023-12-02	2023-12-02	22.67500	15.21	30.27	78.64167	63.32	96.74
S01	2023-12-03	2023-12-03	23.35333	16.95	30.88	79.42042	60.37	96.08
S01	2023-12-04	2023-12-04	23.39167	16.26	30.54	77.86875	59.08	91.41
S01	2023-12-05	2023-12-05	23.13667	15.71	29.64	78.99000	62.51	98.36
S01	2023-12-06	2023-12-06	22.63375	16.61	29.18	80.20750	63.50	97.18

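A named list is better when each weather variable needs its own daily definition. In this example, temperature gets mean, minimum, and maximum; relative humidity gets mean and median; rainfall gets total and maximum hourly rainfall; and leaf wetness gets wet hours. Wrapping a statistic in a named list, as in list(wet_hours = "sum"), puts that label in the output column name, giving daily_wet_hours_leaf_wetness rather than daily_sum_leaf_wetness. The preview below shows only a few of the resulting columns.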

# Give each weather variable its own set of daily statistics.
daily_selected_stats <- aggregate_weather_daily(
  weather = hourly_weather,
  id_col = "site_id",
  statistics = list(
    temp = c("mean", "min", "max"),
    rh = c("mean", "median"),
    rain = c("sum", "max"),
    # The inner name relabels the statistic in the output column name.
    leaf_wetness = list(wet_hours = "sum")
  )
)

# Preview a subset of the generated columns for the first site.
daily_selected_stats %>%
  filter(site_id == first(site_id)) %>%
  select(
    site_id, date,
    daily_mean_temp, daily_sum_rain, daily_wet_hours_leaf_wetness
  ) %>%
  slice_head(n = 6) %>%
  knitr::kable()

site_id	date	daily_mean_temp	daily_sum_rain	daily_wet_hours_leaf_wetness
S01	2023-12-01	22.33292	7.15	6
S01	2023-12-02	22.67500	0.85	4
S01	2023-12-03	23.35333	3.61	7
S01	2023-12-04	23.39167	0.00	3
S01	2023-12-05	23.13667	6.59	6
S01	2023-12-06	22.63375	0.86	6

Summarize daily biological conditions

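Daily aggregation can also count multivariable conditions within each day. This is useful when hourly records are available and the biological question is about how many hours per day were favorable. The example below uses .conditions with count_when() to count warm and humid hours, defined here as hourly records with temperature between 18 and 26 and relative humidity of at least 90; the count is written to daily_favorable_hours.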

# Combine ordinary daily means with a per-day count of favorable records.
daily_condition_stats <- aggregate_weather_daily(
  weather = hourly_weather,
  id_col = "site_id",
  statistics = list(
    temp = "mean",
    rh = "mean",
    .conditions = list(
      # Favorable: temp within [18, 26] and rh of at least 90.
      favorable_hours = count_when(temp >= 18 & temp <= 26 & rh >= 90)
    )
  )
)

# Preview the means and the condition count for the first site.
daily_condition_stats %>%
  filter(site_id == first(site_id)) %>%
  select(
    site_id, date,
    daily_mean_temp, daily_mean_rh, daily_favorable_hours
  ) %>%
  slice_head(n = 8) %>%
  knitr::kable()

site_id	date	daily_mean_temp	daily_mean_rh	daily_favorable_hours
S01	2023-12-01	22.33292	80.61750	3
S01	2023-12-02	22.67500	78.64167	2
S01	2023-12-03	23.35333	79.42042	5
S01	2023-12-04	23.39167	77.86875	3
S01	2023-12-05	23.13667	78.99000	2
S01	2023-12-06	22.63375	80.20750	3
S01	2023-12-07	22.24750	79.78875	4
S01	2023-12-08	21.59917	80.14833	2

The plot below checks whether the condition behaves as expected through time. This is a useful quality-control step before the daily table is used in window-pane feature generation.

# Bar chart of the daily favorable-hour count for the first site.
daily_condition_stats %>%
  filter(site_id == first(site_id)) %>%
  ggplot(aes(x = date, y = daily_favorable_hours)) +
  geom_col(width = 0.75, fill = "#3f7d58") +
  labs(
    title = "Daily favorable-hour counts from hourly records",
    x = NULL,
    y = "Favorable hours per day"
  ) +
  cowplot::theme_half_open()

Bar chart of daily favorable-hour counts from hourly records for the first site.

Use non-standard weather-column names

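Field datasets often use names such as temp2m, prectot, or sradiation. Use weather_cols to tell windcut which columns should be aggregated. The simplest approach is to pass the real column names and use those same names in statistics. The sradiation column in this example is synthetic and depends only on the hour of day, so its daily sum is the same on every day.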

# Rename the demo columns to field-style names and add a synthetic
# solar-radiation column computed from the hour of day.
custom_hourly <- hourly_weather %>%
  rename(temp2m = temp, relhum = rh, prectot = rain) %>%
  mutate(
    sradiation = pmax(0, 600 * sin(as.numeric(format(time, "%H")) / 24 * pi))
  )

# Pass the real column names and reuse them as the names in statistics.
custom_daily <- aggregate_weather_daily(
  weather = custom_hourly,
  id_col = "site_id",
  weather_cols = c("temp2m", "relhum", "prectot", "sradiation"),
  statistics = list(
    temp2m = c("mean", "max"),
    relhum = "mean",
    prectot = "sum",
    sradiation = "sum"
  )
)

# Preview the first six daily rows of the first site.
custom_daily %>%
  filter(site_id == first(site_id)) %>%
  slice_head(n = 6) %>%
  knitr::kable()

site_id	date	time	daily_mean_temp2m	daily_max_temp2m	daily_mean_relhum	daily_sum_prectot	daily_sum_sradiation
S01	2023-12-01	2023-12-01	22.33292	28.50	80.61750	7.15	9154.231
S01	2023-12-02	2023-12-02	22.67500	30.27	78.64167	0.85	9154.231
S01	2023-12-03	2023-12-03	23.35333	30.88	79.42042	3.61	9154.231
S01	2023-12-04	2023-12-04	23.39167	30.54	77.86875	0.00	9154.231
S01	2023-12-05	2023-12-05	23.13667	29.64	78.99000	6.59	9154.231
S01	2023-12-06	2023-12-06	22.63375	29.18	80.20750	0.86	9154.231

If shorter output names are useful later, use a named weather_cols vector. The names on the left become the names used in statistics and in the output columns; the names on the right are the columns in the original dataset.

# Named weather_cols: each name is the short label, each value the source
# column; statistics are keyed by the short labels.
custom_daily_short_names <- aggregate_weather_daily(
  weather = custom_hourly,
  id_col = "site_id",
  weather_cols = c(
    air_temp = "temp2m",
    humidity = "relhum",
    rain = "prectot",
    solar = "sradiation"
  ),
  statistics = list(
    air_temp = c("mean", "max"),
    humidity = "mean",
    rain = "sum",
    solar = "sum"
  )
)

# Preview the first six daily rows of the first site.
custom_daily_short_names %>%
  filter(site_id == first(site_id)) %>%
  slice_head(n = 6) %>%
  knitr::kable()

site_id	date	time	daily_mean_air_temp	daily_max_air_temp	daily_mean_humidity	daily_sum_rain	daily_sum_solar
S01	2023-12-01	2023-12-01	22.33292	28.50	80.61750	7.15	9154.231
S01	2023-12-02	2023-12-02	22.67500	30.27	78.64167	0.85	9154.231
S01	2023-12-03	2023-12-03	23.35333	30.88	79.42042	3.61	9154.231
S01	2023-12-04	2023-12-04	23.39167	30.54	77.86875	0.00	9154.231
S01	2023-12-05	2023-12-05	23.13667	29.64	78.99000	6.59	9154.231
S01	2023-12-06	2023-12-06	22.63375	29.18	80.20750	0.86	9154.231

Check the daily signal

After aggregation, plot the daily weather before moving to window generation. This is a simple way to catch unit mistakes, unexpected missing periods, or daily statistics that do not match the biology of the disease.

# Reshape the first site's daily summaries to long form, one row per
# date and variable, so each variable can get its own panel.
plot_daily <- daily_weather %>%
  filter(site_id == first(site_id)) %>%
  select(
    site_id, date,
    daily_mean_temp, daily_mean_rh, daily_sum_rain, daily_sum_leaf_wetness
  ) %>%
  pivot_longer(
    cols = starts_with("daily_"),
    names_to = "variable",
    values_to = "value"
  )

# One stacked panel per variable, each with its own y-axis scale.
ggplot(plot_daily, aes(x = date, y = value)) +
  geom_line(linewidth = 0.7, color = "#3f7d58") +
  facet_wrap(vars(variable), ncol = 1, scales = "free_y") +
  labs(
    title = "Daily weather after aggregation",
    x = NULL,
    y = NULL
  ) +
  cowplot::theme_half_open()

Line charts of daily mean temperature, mean relative humidity, rainfall sum, and leaf-wetness sum for the first site, one panel per variable.

Use the daily data in the window-pane workflow

The daily table can be passed directly to window_pane(). Negative offsets create windows before the reference date, positive offsets create windows after the reference date, and mixed offsets create windows around the reference date.

# Candidate windows: width 5, offsets from -21 to -1 relative to
# assessment_time.
windows <- make_windows(
  min_offset = -21,
  max_offset = -1,
  width = 5,
  reference_col = "assessment_time"
)

# Build window features from the daily table. weather_cols maps short
# labels (names) to the daily summary columns (values).
features <- window_pane(
  weather = daily_weather,
  assessments = window_pane_demo_data[["assessments"]],
  windows = windows,
  id_col = "site_id",
  response_col = "disease_intensity",
  weather_cols = c(
    temp = "daily_mean_temp",
    rh = "daily_mean_rh",
    rain = "daily_sum_rain",
    leaf_wetness = "daily_sum_leaf_wetness"
  ),
  unit = "days"
)

# Preview the first ten columns of the first six rows.
features %>%
  select(1:10) %>%
  slice_head(n = 6) %>%
  knitr::kable()

site_id	assessment_time	disease_intensity	n_obs_window_m21_m16	temp_mean_window_m21_m16	temp_min_window_m21_m16	temp_max_window_m21_m16	rh_mean_window_m21_m16	rain_sum_window_m21_m16	leaf_wetness_sum_window_m21_m16
S01	2024-05-18	75.2	5	20.69825	20.52250	21.05250	80.33983	14.94	37
S02	2024-05-07	59.2	5	21.67725	20.61042	22.44708	80.63125	14.29	31
S03	2024-05-20	53.9	5	21.23808	20.29917	22.29500	80.50125	28.07	39
S04	2024-04-12	71.7	5	22.91025	22.33125	23.15583	79.52167	2.95	26
S05	2024-04-29	80.9	5	22.81483	21.84917	23.47375	79.82450	9.89	21
S06	2024-04-15	87.2	5	22.69733	21.65542	23.59667	80.00858	16.77	36

This separation keeps the workflow explicit: first define what one day means, then define where the candidate windows sit relative to the biological reference date.

site_id	n_days
S01	180
S02	180
S03	180
S04	180
S05	180
S06	180
S07	180
S08	180
S09	180
S10	180

site_id	n_days
S01	180
S02	180
S03	180
S04	180
S05	180
S06	180
S07	180
S08	180
S09	180
S10	180

site_id	n_days
S01	180
S02	180
S03	180
S04	180
S05	180
S06	180
S07	180
S08	180
S09	180
S10	180