We provide code documentation used in the paper, “Dynamic Ensemble Prediction of Cognitive Performance in Spaceflight.” The goal of this project was to predict neurocognitive performance assessed via a brief psychomotor vigilance test (PVT) in astronauts aboard the International Space Station. In particular, the outcome of interest was the LRM-50 metric (Basner et al. 2015), where values of LRM-50 less than 0 correspond to better performance and lower likelihood of being sleep-deprived, and vice versa for LRM-50 greater than 0. Our data consisted of longitudinal observations of astronaut PVT performance, as well as other time-varying predictors measured by the Reaction Self-Test (RST) and PVT, operational variables, environmental stressors, and demographic traits.
To predict LRM-50, we employed an ensemble model in which the final prediction was the average of the predictions from three models: a linear mixed effects (LME) model, a functional concurrent regression (FCR) model, and a random forest model.
The ensemble was ultimately selected because it achieved lower prediction error than any single model alone.
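In symbols, letting \(\hat{y}^{\mathrm{LME}}_{ij}\), \(\hat{y}^{\mathrm{FCR}}_{ij}\), and \(\hat{y}^{\mathrm{RF}}_{ij}\) denote the three models' predictions for participant \(i\) at observation \(j\), the ensemble prediction is the equally weighted average

\[ \hat{y}^{\mathrm{ens}}_{ij} = \tfrac{1}{3}\left(\hat{y}^{\mathrm{LME}}_{ij} + \hat{y}^{\mathrm{FCR}}_{ij} + \hat{y}^{\mathrm{RF}}_{ij}\right), \]

matching the equal weights (`wgt_lrf`) used in the model-fitting code below.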
Note: some directory and Excel column naming conventions may vary based on your operating system. The following code was run on macOS.
# Markdown and Plots
library(pander)
library(plotly)
library(d3heatmap)
library(stargazer)
library(pls)
library(ggfortify)
library(grid)
library(gridExtra)
# Modelling
library(nlme)
library(refund)
library(fcr)
library(randomForest)
# Data
library(fasttime)
library(openxlsx)
library(zoo)
library(xts)
library(lubridate)
library(readxl)
library(purrr)
library(tidyverse)
####### USER DEFINED PATHS ########
# trish.path: folder containing everything, including the following subdirectories:
# dir.path: subfolder of trish.path containing Data
# mod.path.final: subfolder of trish.path containing model predictions/objects
# code.path: subfolder of trish.path containing R scripts, e.g. helpers.R
# Additionally, create folders in dir.path to hold the output
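# Example definitions (hypothetical paths -- set these for your own machine):
# trish.path <- "~/TRISH"
# dir.path <- file.path(trish.path, "Data")
# mod.path.final <- file.path(trish.path, "Models_Final")
# code.path <- file.path(trish.path, "Code")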
# Directory to save all LOESS/interpolation objects
smooth.path <- file.path(dir.path, "Smoothed Objects")
if (!dir.exists(smooth.path)) dir.create(smooth.path)
# Directory for final deliverable of data and data dictionary
del.path <- file.path(dir.path, "Final Data Deliverable")
if (!dir.exists(del.path)) dir.create(del.path)
# Directory for variable selection
sel.path <- file.path(dir.path, "Variable_Selection")
if (!dir.exists(sel.path)) dir.create(sel.path)
# Directory for prediction assessment
day1.path <- file.path(dir.path, "Prediction_1Day")
if (!dir.exists(day1.path)) dir.create(day1.path)
# Number of cores for parallel processing
n_cores = 8
# Read in variable names and their labels
vars_and_labels <- read.csv(file.path(dir.path, "vars_and_labels.csv"))
# Some helper functions (see Appendix below)
source(file.path(code.path, "helpers.R"))
get_label <- function(varnames){
x = data.frame(Var = varnames) %>%
left_join(vars_and_labels, by = "Var") %>%
mutate_all(as.character) %>%
mutate(Label = ifelse(is.na(Label), Var, Label))
return(as.character(x$Label))
}
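# Example usage (hypothetical; output depends on the contents of vars_and_labels):
# get_label(c("Temp_CS", "Some_Unlabeled_Var"))
# #> "Temperature, Smoothed (Celsius)" "Some_Unlabeled_Var"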
Per funding agency requirements, the RST data analyzed in this study were uploaded to NASA’s Life Sciences Data Archive (LSDA; https://lsda.jsc.nasa.gov) and, together with the environmental data, are available upon request from NASA.
Data from multiple sources (RST data, environmental variable time series, demographic and other information) were cleaned, aggregated, and smoothed in preparation for statistical modelling.
First, we read in the raw RST data from the .xlsx file, which contains 1 row for each participant and instance of a Reaction Self-Test (RST). We then create new variables (variable names appearing `like this`) that we will need later in the analysis:

- Medication flags for sleep aids, pain medications, antihistamines, and decongestants: `Med_SleepAid`, `Med_Pain`, `Med_Antihist`, and `Med_Decongest`
- Date variables to track days since the beginning of the flight:
  - `abs_days`: elapsed days since the first measurement in the entire data
  - `abs_days_sub`: (person-level) elapsed days since the first pre-flight measurement
- Average pre-flight PVT: the (person-level) average overall performance score (OPS) on the PVT in the pre-flight period: `Pre_PVT`
- EVA indicator variables for whether the participant has an EVA that day or the next day: `EVA_Today` and `EVA_Tomorrow`
- Standardized cognitive outcomes: for each participant, the PVT and LRM-50 metrics were transformed into \(z\)-scores using the person-specific mean and variance:
  - `LRM_Z` is the standardized LRM-50
  - `PVT_Z` is the standardized PVT
- Computer location: for each RST instance, we match the computer location to either Node 2 or US Lab: `Location`
# Pre-defined names: sleep aids, painkillers, antihistamines, decongestants
meds_sleep = c("Ambien", "Sonata")
meds_pain = c("Ibuprofen", "Tylenol", "Aspirin", "Naproxen")
meds_antihist = c("Claritin", "Benadryl")
meds_dec = c("Sudafed")
# EVA Variables - EVA Tomorrow
eva_tom <- read_excel(paste(dir.path,'RST_Data.xlsx',sep='/'),
sheet = 5, col_types = c("text", "date", "date", "text", "numeric", "numeric")) %>%
select(ID, EVA_Tomorrow_Date, EVA_Tomorrow, Time_of_Day) %>%
filter(EVA_Tomorrow == 1)
# EVA Variables - EVA Today
eva_tod <- read_excel(paste(dir.path,'RST_Data.xlsx',sep='/'),
sheet = 5, col_types = c("text", "date", "date", "text", "numeric", "numeric")) %>%
select(ID, EVA_Today_Date, EVA_Today, Time_of_Day) %>%
filter(EVA_Today == 1)
# Extract column names from Excel file
rst_raw_names <- read_excel(paste(dir.path,'RST_Data.xlsx',sep='/'),
sheet = 2) %>% slice(2) %>% unlist %>% unname
# Read in the data
rst <- read_excel(paste(dir.path,'RST_Data.xlsx',sep='/'),
sheet = 2, skip = 2,
col_types = c(rep("text", 3),
"date",
rep("text", 136)))
# Format column names so spaces become underscores
rst.clean <- rst %>%
`colnames<-`(gsub('([[:punct:]])|\\s+','_',rst_raw_names))
# Remove columns named "Empty"
remove_cols = which(rst_raw_names=='Empty')
rst.clean <- rst.clean[,-remove_cols] %>%
# Rename the neurobehavioral variables
rename(Errors_of_comission = Erros_of_comission,
Difficulty_Rating = Difficulty_Rating__0_Not__1_Somewhat__2_Very____no_data_,
Interrupted = Interrupted_by_crewmember_ground_control,
Problems_testing_equipment = Problems_with_the_testing_equipment,
# Rename the sleep questionnaire variables
Low_Workload = Workload__Very_high_0_vs_Very_low_10,
Poor_Sleep_Quality = Sleep_Quality__Good_0_vs_Poor_10,
Pre_PVT_Very_Sleepy = Pre_PVT___Not_at_all_0_vs_Very_much_10,
Post_PVT_Very_Sleepy = Post_PVT___Not_at_all_0_vs_Very_much_10,
Physically_Exhausted = Energetic_0_vs_Physically_exhausted_10,
Mentally_Fatigued = Mentally_sharp_0_vs_Mentally_fatigued_10,
Fresh_Ready_to_Go = Tired_0_vs_Fresh__ready_to_go_10,
Very_stressed = Not_stressed_at_all_0_vs_Very_stressed_10) %>%
# Convert test completion time into POSIXct
mutate(Time_of_Test_Completion2 = format(
as.POSIXct(Sys.Date() + as.numeric(Time_of_Test_Completion)),
"%H:%M", tz="UTC"),
# Combine the date and time of day into one variable: obs_time
obs_time = paste(Date_of_Test_Completion, Time_of_Test_Completion2),
obs_time = as.POSIXct(obs_time, tz = "UTC"),
# Truncated day/hour
Time_day = as.POSIXct(trunc(obs_time, units = "days"), tz = "UTC"),
Time_hour = as.POSIXct(trunc(obs_time, units = "hours"), tz = "UTC"),
# Total sleep time
Total_sleep_hrs = as.numeric(Total_sleep_time)*24,
# Other sleep variables (turn them to hours)
Time_fall_asleep = as.numeric(Time_it_took_to_fall_asleep)*24,
Time_awake_disturb = as.numeric(Total_time_awake_during_the_night_due_to_sleep_disturbances)*24,
Time_before_waking = as.numeric(Time_spent_in_bed_before_getting_up)*24,
Total_sleep_missed = Time_fall_asleep + Time_awake_disturb + Time_before_waking,
# Pre- and post-flight indicators
Pre = !is.na(Pre_Flight),
Post = !is.na(Post_Flight),
Time_Type = ifelse(Pre, "Pre", ifelse(Post, "Post", "Flight")),
Overall_Performance = as.numeric(Overall_Performance),
# Calculate days elapsed since first measurement for any participant
Earliest_hr = min(obs_time),
abs_days = as.numeric(difftime(time1 = obs_time,
time2 = Earliest_hr, units = "days")),
# Medication flags
Med_SleepAid = ifelse((Med1_Name %in% meds_sleep)|(Med2_Name %in% meds_sleep)|
(Med3_Name %in% meds_sleep)|(Med4_Name %in% meds_sleep)|
(Evening_Med1_Name %in% meds_sleep)|(Evening_Med2_Name %in% meds_sleep)|
(Evening_Med3_Name %in% meds_sleep)|(Evening_Med4_Name %in% meds_sleep), 1, 0),
Med_Pain = ifelse((Med1_Name %in% meds_pain)|(Med2_Name %in% meds_pain)|
(Med3_Name %in% meds_pain)|(Med4_Name %in% meds_pain)|
(Evening_Med1_Name %in% meds_pain)|(Evening_Med2_Name %in% meds_pain)|
(Evening_Med3_Name %in% meds_pain)|(Evening_Med4_Name %in% meds_pain), 1, 0),
Med_Antihist = ifelse((Med1_Name %in% meds_antihist)|(Med2_Name %in% meds_antihist)|
(Med3_Name %in% meds_antihist)|(Med4_Name %in% meds_antihist)|
(Evening_Med1_Name %in% meds_antihist)|(Evening_Med2_Name %in% meds_antihist)|
(Evening_Med3_Name %in% meds_antihist)|(Evening_Med4_Name %in% meds_antihist), 1, 0),
Med_Decongest = ifelse((Med1_Name %in% meds_dec)|(Med2_Name %in% meds_dec)|
(Med3_Name %in% meds_dec)|(Med4_Name %in% meds_dec)|
(Evening_Med1_Name %in% meds_dec)|(Evening_Med2_Name %in% meds_dec)|
(Evening_Med3_Name %in% meds_dec)|(Evening_Med4_Name %in% meds_dec), 1, 0),
# Computer Locations
SSC_str = str_locate(string = Computer_Description, pattern = "SSC")[,1],
SSC = substr(Computer_Description, start = SSC_str, stop = SSC_str + 5),
SSC = gsub(pattern = ")| |SSC", replacement = "", x = SSC),
Location = ifelse(SSC %in% c(20, 14, 21, 19, 13, 6), "Node 2",
ifelse(SSC %in% c(5, 8, 12, 4, 16), "US Lab",
"Avg")),
# For noise interpolation
# Morning = 7am until 11pm (16 hours)
# Evening = 11pm - 7am (8 hours)
Noise_Day = ((hour(obs_time) >= 7) & (hour(obs_time) < 23)),
# LRM-50 Metric
LRM_50 = as.numeric(LRM_50)) %>%
# Ensure variables are numeric
mutate_at(vars(Errors_of_comission,Errors_of_omission, MeanRRT, Caffeine), as.numeric) %>%
# Deal with missing values in the sleep variables by just carrying the non-NA results forward
mutate_at(vars(Low_Workload:Very_stressed), as.numeric) %>%
mutate_at(vars(Low_Workload:Total_sleep_hrs, Total_sleep_missed), na.locf, na.rm = FALSE) %>%
# For each participant....
group_by(Subject_ID) %>%
# Calculate person-level observation ID
# Calculate person-level pre-flight PVT
# Calculate PVT_Z as the person-specific Z score
mutate(obs = 1:n(),
min_day_sub = min(abs_days),
abs_days_sub = abs_days - min(abs_days),
Pre_PVT = mean(Overall_Performance[Time_Type == "Pre"]),
PVT_Z = (Overall_Performance - mean(Overall_Performance))/sd(Overall_Performance),
LRM_Z = (LRM_50 - mean(LRM_50))/sd(LRM_50),
LRM_mean = mean(LRM_50),
LRM_sd = sd(LRM_50),
# Subject's 10th and 90th percentile of LRM
LRM_10bestworst = ifelse(LRM_50 < quantile(LRM_50, 0.10), "Best (Lowest) 10% LRM", ""),
LRM_10bestworst = ifelse(LRM_50 > quantile(LRM_50, 0.90), "Worst (Highest) 10% LRM", LRM_10bestworst)) %>%
ungroup %>%
# Add EVA_Today and EVA_Tomorrow indicator variables
left_join(eva_tom, by = c('Date_of_Test_Completion' = 'EVA_Tomorrow_Date',
'Subject_ID' = 'ID',
'Test_Track_Type' = 'Time_of_Day')) %>%
left_join(eva_tod, by = c('Date_of_Test_Completion' = 'EVA_Today_Date',
'Subject_ID' = 'ID',
'Test_Track_Type' = 'Time_of_Day')) %>%
mutate(EVA_Today = ifelse(is.na(EVA_Today), 0, EVA_Today),
EVA_Tomorrow = ifelse(is.na(EVA_Tomorrow), 0, EVA_Tomorrow)) %>%
# Remove variables that were all missing in the data file
select(-Sleep_Shift_Proximity, -Machine_Type, -Adjusted_Hardware_Latency)
To create a single variable `Self_PC_1` aggregating the various VAS self-report measures, we performed a principal components analysis of the following VAS response items:
# Self-Report Variables
rst_self = rst.clean %>%
select(Low_Workload,
Poor_Sleep_Quality,
Pre_PVT_Very_Sleepy,
Physically_Exhausted,
Mentally_Fatigued,
Fresh_Ready_to_Go,
Very_stressed) %>%
mutate_all(as.numeric) %>%
as.data.frame
# PCA
rst.pca_self <- prcomp(formula = ~., data = rst_self,
na.action = "na.omit", scale = TRUE)
# Get matrix of weights into correct shape
weights_self = rst.pca_self$rotation[,1] %>% as.matrix
# Get matrix of centers and scales into correct shape
cen = rst.pca_self$center %>% as.matrix %>% t
cen = cen[rep(1, times = nrow(rst_self)),]
sca = rst.pca_self$scale %>% as.matrix %>% t
sca = sca[rep(1, times = nrow(rst_self)),]
# Weight the scaled data to produce weighted variable
rst_scaled = ((data.matrix(rst_self) - cen)/sca)
rst_scaled[is.na(rst_scaled)] <- 0
# Multiply the scaled data by the loadings
vars_self = (rst_scaled %*% weights_self) %>%
as.data.frame %>%
`colnames<-`(paste0("Self_PC_", 1))
rst.clean <- rst.clean %>%
cbind(vars_self)
# Save the cleaned data into the data directory
saveRDS(rst.clean,
file = file.path(dir.path, "rst_clean.rds"))
The `Self_PC_1` variable was formed by weighting the seven VAS items based on the Principal Component 1 loadings. In general, a higher value of `Self_PC_1` corresponds to feelings of worse sleep quality, more exhaustion and mental fatigue, more stress, higher workload, and less freshness. We therefore named this variable the Stress/Fatigue Composite Score.
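As a quick sanity check (a minimal sketch, not part of the original pipeline), the manual projection above should reproduce prcomp's own scores for rows without missing values, since na.omit preserves row order:

# For complete cases, the manually weighted scores should equal the
# first column of prcomp's score matrix
cc <- complete.cases(rst_self)
all.equal(unname(vars_self$Self_PC_1[cc]),
unname(rst.pca_self$x[, 1]))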
To calculate the predicted lapses, the Windows batch program mcc13_v4 (McCauley et al. 2013; Van Dongen et al. 2004) was run using the observed sleep schedule (with wake and bedtimes carried forward over any missing data). One participant did not complete their sleep diary for any observation, so we imputed the mean number of predicted lapses for that person.
# After running mcc13_v4, read in the predicted data
dat_preds <- list.files(file.path(dir.path, "Circadian_Subject"),
pattern = "pred_", full.names = TRUE) %>%
map(preds_files_fun) %>%
Reduce(rbind,.)
# Process and save everything
saveRDS(dat_preds,
file.path(dir.path, "circadian_preds_raw.RDS"))
# For inflight RSTs...
rst.clean %>%
filter(Time_Type == "Flight") %>%
# Get the bedtime and sleep duration
select(Subject_ID, Time = obs_time, Day = Time_day,
Bedtime0 = Time_went_to_bed,
Duration_FallAsleep = Time_it_took_to_fall_asleep,
Total_sleep_hrs) %>%
# Format everything as integers for later
mutate(Bedtime_Minutes = as.numeric(Bedtime0)*24*60,
FallAsleep_Minutes = as.numeric(Duration_FallAsleep)*24*60,
Sleep_Minutes = as.integer(round(Total_sleep_hrs*60))) %>%
# For each participant, calculate the first and last days
group_by(Subject_ID) %>%
mutate(Day_First = min(Day),
Day_Last = max(Day)) %>%
ungroup %>%
# Only keep information from days when bedtime, time to fall asleep, and sleep duration are all available
filter(!(is.na(Bedtime_Minutes) | is.na(FallAsleep_Minutes) | is.na(Sleep_Minutes))) %>%
mutate(Subject_ID = factor(Subject_ID)) %>%
split(.,.$Subject_ID) %>%
saveRDS(file.path(dir.path, "Circadian_Subject", "rst_sleep_raw.RDS"))
dat_sleep <- readRDS(file.path(dir.path, "Circadian_Subject", "rst_sleep_raw.RDS")) %>%
Reduce(rbind,.)
# Add the date back for each participant's data
unique(dat_sleep$Subject_ID) %>%
map(merge_circadian,
preds_dat = dat_preds) %>%
Reduce(rbind,.) %>%
# Hourly predicted lapses file
saveRDS(., file = file.path(dir.path, "circadian_preds_hour.RDS"))
# Daily predicted lapses file
readRDS(file.path(dir.path, "circadian_preds_hour.RDS")) %>%
mutate(Day = as.POSIXct(trunc(Date_Time, units = "days"))) %>%
# Take the daily average
group_by(ID, Day) %>%
summarize(Pred_Avg_Lapses = mean(Pred_Avg_Lapses)) %>%
ungroup %>%
saveRDS(., file = file.path(dir.path, "circadian_preds_daily.RDS"))
Integrating the many data sources into a single, “analytic” data frame for modelling required several strategies. One reason is that environmental variables were collected at different time intervals: radiation was measured daily; noise was measured hourly; and temperature, CO2, and O2 were measured every few minutes. Moreover, these measurements came from different sensors distributed across the ISS. To match each RST data point to a single value of each environmental variable, we employed three tactics: aggregating measurements to a common hourly or daily grid, smoothing or interpolating over time (LOESS or linear interpolation), and matching each RST to a sensor location (or a weighted average across locations).
Further details on how each variable was handled are given below:

- Demographics: age at docking (`Age_at_Dock`) and male indicator (`Male`)
- Radiation dose (`Radiation_Dose`) in units of mGy: this is in daily units, so it is smoothed using LOESS.
- ISS occupancy (`ISS_Occupants`), the number of people aboard the ISS: this is also in daily units.
- Noise (`Noise_dBA`) in dBA: noise levels were recorded only on a handful of days, so day (7am-11pm) and night (11pm-7am) measurements were separately linearly interpolated.
- Temperature (`Temp`) in degrees Celsius: use the LOESS-smoothed Node 2/US Lab measurement if the RST was taken on a computer in Node 2/US Lab; otherwise, use the LOESS-smoothed weighted average of the Node 2 (75%) and US Lab (25%) measurements.
- CO2 (`CO2`) in mmHg: the sensor used depended on the calendar date (mm/dd/yyyy).
- O2 partial pressure (`ppO2`) in mmHg: the sensor used depended on the calendar date.
- Predicted lapses (`Pred_Lapses`): this was calculated in the previous section.
was calculated in the previous section.Because environmental data from the ISS was the only environmental data available (i.e., there was no environmental data collected for pre- or post-flight periods), the remainder of analyses only involve the inflight data.
rst.clean <- readRDS(file.path(dir.path, "rst_clean.rds"))
all_subject_ids = unique(rst.clean$Subject_ID)
# Read in data to be merged with the cleaned RST data, rst.clean
# Demographic
dem <- readRDS(file.path(dir.path, "demographic_vars.rds"))
# Noise - hourly
dat_noise_hr <- readRDS(file.path(dir.path, "noise_hr_loc.rds"))
# Radiation - daily
dat_rad_hr <- readRDS(file.path(dir.path, "rad_day.rds"))
# Temperature - hourly average
dat_temp_hr <- readRDS(file.path(dir.path, "temp_hr_loc.rds"))
# O2 - hourly average
dat_ppo2_hr <- readRDS(file.path(dir.path, "ppo2_hr.rds"))
# CO2 - hourly average
dat_co2_hr <- readRDS(file.path(dir.path, "co2_hourly.rds")) %>%
# Delete consecutive values that are repeated more than 3 times--
# these are likely due to sensor malfunction
mutate(CO2 = delete_reps(CO2))
# Circadian - hourly
# NOTE: program to predict lapses (mcc13_v4) is a Windows batch file, not shown here
dat_circ_hr <- readRDS(file.path(dir.path, "circadian_preds_hour.RDS")) %>%
mutate(Date_Time = fastPOSIXct(trunc(Date_Time, units = "hours"), tz = "UTC"))
dat_occ <- read_csv(file.path(dir.path, "iss_occupants.csv"), n_max = 1688) %>%
mutate(Time_day = as.POSIXct(Date, format = "%m/%d/%Y", tz = "UTC")) %>%
select(Time_day, ISS_Occupants = Occupants)
# Add demographic variables to both flight and non-flight data
rst_flight <- rst.clean %>%
mutate(Subject_ID = as.character(Subject_ID)) %>%
filter(Time_Type == "Flight") %>%
# Add demographic variables
left_join(., dem, by = c("Subject_ID" = "ID"))
rst_nonflight <- rst.clean %>%
mutate(Subject_ID = as.character(Subject_ID)) %>%
filter(Time_Type != "Flight") %>%
# Add demographic variables
left_join(., dem, by = c("Subject_ID" = "ID"))
rst.merged <- rst_flight %>%
# Merge environmental variables with in-flight only
left_join(., dat_noise_hr, by = c("Time_hour", "Location" = "Loc")) %>%
left_join(., dat_rad_hr, by = "Time_day") %>%
left_join(dat_temp_hr, by = c("Time_hour", "Location" = "Loc")) %>%
left_join(., dat_co2_hr, by = "Time_hour") %>%
left_join(., dat_ppo2_hr, by = "Time_hour") %>%
left_join(., dat_occ, by = "Time_day") %>%
# Circadian lapses
left_join(., dat_circ_hr, by = c("Time_hour" = "Date_Time",
"Subject_ID" = "ID")) %>%
# Merge back with the full data
bind_rows(rst_nonflight) %>%
arrange(Subject_ID, abs_days_sub)
# Save this file
saveRDS(rst.merged,
file = file.path(dir.path, "rst_full_hr.rds"))
These data were processed from their raw formats using the script given in the Appendix. Temperature, radiation, CO2, and O2 were smoothed using LOESS. Noise was smoothed using linear interpolation, separately for daytime (7 AM to 11 PM) and nighttime (11 PM to 7 AM) observations. Sleep diary data (for predicted lapses) were smoothed using last observation carried forward (LOCF).
# Temperature - fit separate LOESS for Node 2/US Lab/Averaged values
temp_hr <- readRDS(file.path(dir.path, "temp_hr_loc.RDS")) %>%
mutate(Time_hour2 = as.numeric(Time_hour)) %>% # LOESS only works on numeric domains
split(., .$Loc)
loess(Temp ~ Time_hour2, data = temp_hr$Avg, span = 0.1,
control = loess.control("direct")) %>%
saveRDS(file.path(dir.path, "Smoothed Objects/smooth_temp_avg_hr.RDS"))
loess(Temp ~ Time_hour2, data = temp_hr$`Node 2`, span = 0.1,
control = loess.control("direct")) %>%
saveRDS(file.path(dir.path, "Smoothed Objects/smooth_temp_no2_hr.RDS"))
loess(Temp ~ Time_hour2, data = temp_hr$`US Lab`, span = 0.1,
control = loess.control("direct")) %>%
saveRDS(file.path(dir.path, "Smoothed Objects/smooth_temp_lab_hr.RDS"))
# CO2
co2_hr <- readRDS(file.path(dir.path, "co2_hourly.RDS")) %>%
mutate(Time_hour2 = as.numeric(Time_hour)) # LOESS only works on numeric domains
loess(CO2 ~ Time_hour2, data = co2_hr, span = 0.1,
control = loess.control("direct")) %>%
saveRDS(file.path(dir.path, "Smoothed Objects/smooth_co2_hr.RDS"))
# O2
ppo2_hr <- readRDS(file.path(dir.path, "ppo2_hr.rds")) %>%
mutate(Time_hour2 = as.numeric(Time_hour)) # LOESS only works on numeric domains
loess(ppO2 ~ Time_hour2, data = ppo2_hr, span = 0.1,
control = loess.control("direct")) %>%
saveRDS(file.path(dir.path, "Smoothed Objects/smooth_ppo2_hr.RDS"))
# Noise - fit separate linear interpolation for day/night observations
# We want to interpolate using DAILY values b/c otherwise we're just interpolating the last
# observation from 1 day to the first observation of the next day
# Since noise has a 24-hour cycle
noise_day <- readRDS(file.path(dir.path, "noise_day_noloc.rds")) %>%
split(., .$Noise_Day)
# Linear interpolation of day observations
approxfun(x = noise_day$`TRUE`$Time_day, y = noise_day$`TRUE`$Noise_dBA, rule = 2) %>%
saveRDS(file.path(dir.path, "Smoothed Objects/smooth_noise_day1.RDS"))
# Linear interpolation of night observations
approxfun(x = noise_day$`FALSE`$Time_day, y = noise_day$`FALSE`$Noise_dBA, rule = 2) %>%
saveRDS(file.path(dir.path, "Smoothed Objects/smooth_noise_day2.RDS"))
These smoothed variables were then merged with the RST data to produce the smoothed variables (names ending in `_S`) and the smoothed + imputed variables (names ending in `_CS`). In the latter case, the variable takes the actual hourly mean value if available and the smoothed value otherwise. The smoothed + imputed variables were the ones ultimately employed in our prediction model:

- `Temp_CS`: temperature
- `Noise_dBA_CS`: noise
- `Radiation_Dose_CS`: radiation
- `Pred_Lapses_CS`: predicted lapses
- `CO2_CS`: CO2
- `ppO2_CS`: O2

rst.full_hr <- readRDS(file.path(dir.path, "rst_full_hr.rds")) %>%
mutate(Time_hour2 = as.numeric(Time_hour),
Time_day2 = as.numeric(Time_day))
# Smoothed noise, radiation, CO2, and O2
rst.smooth_hr <- rst.full_hr %>%
# Interpolate using daily noise/radiation values
smooth_fun(var = "Noise_dBA", day_level = TRUE, smooth.dir = smooth.path) %>%
smooth_fun(var = "Radiation_Dose", day_level = TRUE, smooth.dir = smooth.path) %>%
# Interpolate using LOESS hourly values
smooth_fun(var = "CO2", day_level = FALSE, smooth.dir = smooth.path) %>%
smooth_fun(var = "ppO2", day_level = FALSE, smooth.dir = smooth.path) %>%
smooth_fun(var = "Temp", day_level = FALSE, smooth.dir = smooth.path) %>%
arrange(Subject_ID, Time_hour) %>%
# Remove intermediate variables
select(-Time_hour2, -Time_day2)
# Save the smoothed data in the data directory as well as the
# Deliverable directory
saveRDS(rst.smooth_hr, file = file.path(dir.path, "rst_smooth_hr.RDS"))
saveRDS(rst.smooth_hr, file = file.path(del.path, "rst_smooth_hr.RDS"))
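The helper smooth_fun is defined in helpers.R (see Appendix). As a rough illustration only, a smooth_fun-style helper might look like the following minimal sketch, which assumes hypothetical file names for the saved smoothers and glosses over details such as the separate day/night noise interpolants:

# Minimal sketch (not the actual helpers.R implementation): predict from a
# saved smoother, then use the smoothed value wherever the observed value is missing
smooth_fun_sketch <- function(dat, var, day_level = FALSE, smooth.dir){
# Hypothetical file name; the real names are fixed when the smoothers are saved above
smoother <- readRDS(file.path(smooth.dir, paste0("smooth_", tolower(var), ".RDS")))
time_num <- if (day_level) dat$Time_day2 else dat$Time_hour2
nd <- if (day_level) data.frame(Time_day2 = time_num) else data.frame(Time_hour2 = time_num)
smoothed <- if (inherits(smoother, "loess")) {
predict(smoother, newdata = nd)
} else {
smoother(time_num) # approxfun() returns a function
}
dat[[paste0(var, "_S")]] <- smoothed
dat[[paste0(var, "_CS")]] <- ifelse(is.na(dat[[var]]), smoothed, dat[[var]])
dat
}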
In the final deliverable, we included a cleaned file merging the RST, demographic, and environmental data (`rst_clean.RDS`), a data dictionary (`Data_Dictionary.xlsx`), and the analytic data file (`rst_smooth_hr.RDS`) that includes the smoothed variables ultimately used in the prediction model.
# Read in existing data dictionary file
data_dic1 <- read_xlsx(file.path(del.path, "Data_Dictionary.xlsx"), sheet = "rst_smooth")
data_dic2 <- read_xlsx(file.path(del.path, "Data_Dictionary.xlsx"), sheet = "rst_clean")
# Update file with the most current column names in rst_smooth and rst_clean
cnames1 <- readRDS(file.path(dir.path, "rst_smooth_hr.RDS")) %>%
colnames()
sheet1 <- data.frame(Column.Name = cnames1) %>%
full_join(., data_dic1, by = "Column.Name")
cnames2 <- readRDS(file.path(dir.path, "rst_clean.rds")) %>%
colnames()
sheet2 <- data.frame(Column.Name = cnames2 ) %>%
full_join(., data_dic2, by = "Column.Name")
# Write new data dictionary file
fname = file.path(del.path, paste0("Data_Dictionary_", Sys.Date(), ".xlsx"))
openxlsx::write.xlsx(list("rst_smooth" = sheet1,
"rst_clean" = sheet2), file = fname)
To prepare the data for modelling, we further refined the combined RST/environmental/demographic data:

- We kept only the inflight, morning/evening RST measurements.
- We created lagged versions of key predictors (variable names ending in `_L1`).
- We created `inflight_time`, which scaled each participant's elapsed inflight mission time to be between 0 and 1, in order to preserve astronaut privacy.

# Load analytic file
rst.smooth <- readRDS(file.path(dir.path, "rst_smooth_hr.RDS")) %>%
# Keep only inflight, morning/evening, and original RST measurements
filter(Time_Type == "Flight",
Test_Track_Type %in% c("Morning", "Evening")) %>%
select(-Post_PVT_Very_Sleepy) %>%
select(Time_hour,
abs_days_sub,
Subject_ID,
# Overall Performance
Overall_Performance,
PVT_Z,
Test_Track_Type,
# person-level Variables
Male, Age_at_Dock, Pre_PVT,
# Performance Variables
Errors_of_comission:Other,
NumValidResp:MeanSRT__ms_,
# Sleep variables
Low_Workload:Very_stressed,
Total_sleep_hrs,
# Environmental
Radiation_Dose,
Noise_dBA,
Temp,
ppO2,
CO2,
ISS_Occupants,
# Smoothed environmental variables
Radiation_Dose_CS,
Noise_dBA_CS,
Temp_CS,
CO2_CS,
ppO2_CS,
# Medication Flags
Med_Antihist,
Med_SleepAid,
Med_Pain,
Med_Decongest,
# Self-Report Variables
Self_PC_1,
Low_Workload,
Poor_Sleep_Quality,
Very_stressed,
# Sleep Variables
Total_sleep_hrs,
Total_sleep_missed,
# EVA variables
EVA_Today, EVA_Tomorrow,
#LRM-50 metric
LRM_50,
LRM_Z,
LRM_mean, LRM_sd,
# Predicted Lapses
Pred_Avg_Lapses,
# Caffeine Flag
Caffeine) %>%
# Calculate the number of inflight hours elapsed
group_by(Subject_ID) %>%
mutate(Earliest_hr = min(Time_hour),
inflight_hrs_elapsed = as.numeric(difftime(time1 = Time_hour,
time2 = Earliest_hr, units = "hours")),
max_hrs_sub = max(inflight_hrs_elapsed),
# Observation number
obs_num = 1:n()) %>%
group_by(Subject_ID, Test_Track_Type) %>%
arrange(Time_hour) %>%
# Lagged Variables
mutate(Radiation_CS_L1 = lag0(Radiation_Dose_CS, 1),
Noise_CS_L1 = lag0(Noise_dBA_CS, 1),
CO2_CS_L1 = lag0(CO2_CS,1),
ppO2_CS_L1 = lag0(ppO2_CS, 1),
ISS_Occupants_L1 = lag0(ISS_Occupants, 1),
Temp_CS_L1 = lag0(Temp_CS,1),
Overall_Performance_L1 = lag0(Overall_Performance, 1),
Med_Antihist_L1 = lag0(Med_Antihist, 1),
Med_SleepAid_L1 = lag0(Med_SleepAid, 1),
Med_Pain_L1 = lag0(Med_Pain, 1),
Med_Decongest_L1 = lag0(Med_Decongest, 1),
Errors_Comission_L1 = lag0(Errors_of_comission, 1),
Errors_Omission_L1 = lag0(Errors_of_omission, 1),
MeanRRT_L1 = lag0(MeanRRT, 1),
LRM_50_L1 = lag0(LRM_50, 1),
Pred_Lapses_S = na.locf2(Pred_Avg_Lapses),
PVT_Z_L1 = lag0(PVT_Z, 1),
LRM_Z_L1 = lag0(LRM_Z, 1),
Caffeine_L1 = lag0(Caffeine, 1)) %>%
ungroup %>%
# Turn factor variables into factor
mutate(Subject_ID = as.factor(Subject_ID),
# Create subj variable for fcr
subj = Subject_ID,
# Fractional and integer days
# person-specific hours elapsed since first in-flight observation
abs_inflight_hr = inflight_hrs_elapsed,
Male_l = Male, # True/False
Male_n = as.numeric(Male), # Numeric
Male = as.factor(Male), # Factor
Test_Track_Type_l = (Test_Track_Type == "Morning"),
Test_Track_Type_n = as.numeric(Test_Track_Type_l),
Test_Track_Type = as.factor(Test_Track_Type),
# Carry lapses forward; any remaining missing values are imputed
# with the mean below
Pred_Lapses_CS = ifelse(is.na(Pred_Avg_Lapses), Pred_Lapses_S, Pred_Avg_Lapses)) %>%
rowwise %>%
# Rescale time to [0,1] for each participant
mutate(inflight_time = scales::rescale(inflight_hrs_elapsed, from = c(0, max_hrs_sub), to = c(0,1)),
Pred_Lapses_CS = ifelse(is.na(Pred_Lapses_CS), mean(.$Pred_Lapses_CS, na.rm = TRUE), Pred_Lapses_CS)) %>%
ungroup %>%
arrange(Subject_ID, Time_hour) %>%
as.data.frame
saveRDS(rst.smooth,
file.path(dir.path, "rst_smooth_hourly.RDS"))
Another goal of the project was to identify a subset of the most important predictors of LRM-50. To do so, we used a random forest (Breiman 2001), a machine learning algorithm that employs an ensemble of regression trees. A random forest also calculates two metrics of variable importance: %IncMSE, the percent increase in out-of-bag MSE when the variable's values are permuted, and IncNodePurity, the total decrease in node impurity from splits on that variable.
We ranked variables based on %IncMSE over a series of resampling simulations. In each simulation, 50% of the data was sampled, and the importance table, along with the rank of each variable (in order of decreasing %IncMSE), was generated. This was repeated 100 times to obtain 100 rankings. For each variable, we calculated summaries of its rank across simulations (the proportion of simulations in which it ranked in the top 10 or top 15, and its best, mean, median, and worst rank) in order to assess how stable these rankings were among randomly selected subsets of the data.
rst.smooth <- readRDS(file.path(dir.path, "rst_smooth_hourly.RDS"))
set.seed(123123)
rank_fun_sim(n_sum = 100, dat = rst.smooth,
samp_frac = 0.5, outcome = "LRM_50",
covars = covar_unlag, save.dir = sel.path)
rank_fun_sim(n_sum = 100, dat = rst.smooth,
samp_frac = 0.5, outcome = "LRM_Z",
covars = covar_unlag, save.dir = sel.path)
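The function rank_fun_sim is defined in helpers.R (see Appendix), as is the covariate vector covar_unlag. As a rough illustration only, a single resampling iteration might look like the following minimal sketch (hypothetical helper, not the actual implementation):

# Sketch of one resampling iteration: sample 50% of the rows, fit a random
# forest with permutation importance, and rank variables by decreasing %IncMSE
one_rank_sim <- function(dat, outcome, covars, samp_frac = 0.5){
samp <- dat[sample(nrow(dat), size = floor(samp_frac * nrow(dat))), ]
form <- reformulate(covars, response = outcome)
fit <- randomForest(form, data = samp, importance = TRUE, na.action = na.omit)
imp <- importance(fit, type = 1) # %IncMSE
data.frame(Variable = rownames(imp),
Rank = rank(-imp[, 1], ties.method = "min"))
}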
Ultimately, the most important predictors were selected as the environmental variables plus the 10 highest-scoring variables in terms of the percentage of simulations in which the variable ranked in the top 10 (`Top10rate`):
# Top 10 + env variables for the model with LRM-50 outcome
t10_l = c("LRM_50_L1", "Pre_PVT", "Temp_CS", "Self_PC_1",
"Age_at_Dock", "Noise_dBA_CS", "Radiation_Dose_CS",
"Caffeine", "Very_stressed", "Total_sleep_missed",
"ppO2_CS", "CO2_CS")
# Top 10 + env variables for the model with standardized LRM-50 outcome
t10_lz = c("LRM_Z_L1", "Self_PC_1", "Noise_dBA_CS", "Age_at_Dock",
"Pre_PVT", "Total_sleep_hrs", "Temp_CS", "ppO2_CS", "Radiation_Dose_CS",
"Pred_Lapses_CS", "CO2_CS")
readRDS(file.path(sel.path, "varselect_forest_l_tab.RDS")) %>%
mutate(Variable = get_label(Variable)) %>%
arrange(desc(Top10rate), desc(Top15rate), MeanRank, MedRank, TopRank, BottomRank, Variable) %>%
pandoc.table(split.table = Inf)
Variable | Top10rate | Top15rate | TopRank | MeanRank | MedRank | BottomRank |
---|---|---|---|---|---|---|
LRM-50 (Lagged) | 1 | 1 | 1 | 1 | 1 | 1 |
Average Pre-flight OPS | 1 | 1 | 2 | 2 | 2 | 2 |
Temperature, Smoothed (Celsius) | 1 | 1 | 3 | 4 | 4 | 7 |
Fatigue Composite Score | 1 | 1 | 3 | 4.5 | 4 | 9 |
Age at Dock | 1 | 1 | 3 | 4.58 | 4 | 8 |
Noise, Smoothed (dBA) | 1 | 1 | 3 | 5.69 | 6 | 9 |
Radiation, Smoothed (mGy) | 0.98 | 1 | 4 | 7.83 | 8 | 12 |
Caffeine Doses | 0.96 | 1 | 5 | 7.68 | 8 | 14 |
Very Stressed | 0.75 | 1 | 4 | 9.32 | 9 | 15 |
Total Sleep Missed | 0.52 | 0.99 | 6 | 10.51 | 10 | 17 |
O2, Smoothed (mmHg) | 0.34 | 0.9 | 7 | 12.05 | 12 | 17 |
Predicted Lapses | 0.17 | 0.89 | 7 | 12.58 | 12 | 17 |
Total Sleep Hours | 0.16 | 0.9 | 7 | 12.89 | 13 | 17 |
CO2, Smoothed (mmHg) | 0.05 | 0.62 | 10 | 14.11 | 14 | 18 |
Poor Sleep Quality | 0.04 | 0.73 | 8 | 13.97 | 14 | 17 |
Male | 0.03 | 0.85 | 10 | 13.8 | 14 | 17 |
Low Workload | 0 | 0.12 | 12 | 16.81 | 17 | 20 |
Time of Day | 0 | 0 | 17 | 18.33 | 18 | 21 |
ISS Occupants | 0 | 0 | 16 | 18.72 | 19 | 23 |
Sleep Aid Flag | 0 | 0 | 18 | 20.65 | 20 | 24 |
Pain Medication Flag | 0 | 0 | 18 | 22.24 | 22 | 25 |
EVA Today | 0 | 0 | 19 | 22.24 | 22 | 25 |
EVA Tomorrow | 0 | 0 | 19 | 22.48 | 23 | 25 |
Decongestant Flag | 0 | 0 | 20 | 23.47 | 24 | 25 |
Antihistamine Flag | 0 | 0 | 20 | 23.55 | 24 | 25 |
# Plot the first 50 simulations to see how these rankings change
# Between sims
readRDS(file.path(sel.path, "varselect_forest_l.RDS")) %>%
filter(Sim <= 50) %>%
rank_fun_plot
readRDS(file.path(sel.path, "varselect_forest_lz_tab.RDS")) %>%
mutate(Variable = get_label(Variable)) %>%
arrange(desc(Top10rate), desc(Top15rate), MeanRank, MedRank, TopRank, BottomRank, Variable) %>%
pandoc.table(split.table = Inf)
Variable | Top10rate | Top15rate | TopRank | MeanRank | MedRank | BottomRank |
---|---|---|---|---|---|---|
Standardized LRM-50 (Lagged) | 1 | 1 | 1 | 1.15 | 1 | 4 |
Fatigue Composite Score | 1 | 1 | 1 | 2.26 | 2 | 4 |
Noise, Smoothed (dBA) | 1 | 1 | 1 | 3.01 | 3 | 8 |
Age at Dock | 0.98 | 1 | 3 | 6.12 | 6 | 13 |
Average Pre-flight OPS | 0.98 | 1 | 3 | 6.18 | 6 | 11 |
Total Sleep Hours | 0.95 | 1 | 2 | 5.38 | 4 | 13 |
Temperature, Smoothed (Celsius) | 0.89 | 1 | 3 | 7.71 | 8 | 14 |
O2, Smoothed (mmHg) | 0.78 | 1 | 2 | 8.08 | 8 | 14 |
Radiation, Smoothed (mGy) | 0.66 | 0.99 | 4 | 9.19 | 9 | 17 |
Predicted Lapses | 0.41 | 0.97 | 4 | 10.86 | 11 | 17 |
CO2, Smoothed (mmHg) | 0.41 | 0.94 | 4 | 11.04 | 11 | 17 |
Poor Sleep Quality | 0.31 | 0.91 | 6 | 12.19 | 12 | 18 |
Very Stressed | 0.29 | 0.91 | 5 | 11.95 | 12 | 18 |
Caffeine Doses | 0.25 | 0.93 | 6 | 12.19 | 12 | 17 |
Total Sleep Missed | 0.07 | 0.59 | 8 | 14.75 | 15 | 21 |
Low Workload | 0.01 | 0.21 | 10 | 17.25 | 17 | 22 |
Time of Day | 0.01 | 0.19 | 10 | 17.37 | 17.5 | 22 |
Male | 0 | 0.16 | 12 | 17.32 | 17 | 21 |
ISS Occupants | 0 | 0.16 | 13 | 17.46 | 18 | 22 |
Pain Medication Flag | 0 | 0.03 | 14 | 22.11 | 23 | 25 |
EVA Tomorrow | 0 | 0.01 | 15 | 21.48 | 22 | 25 |
Sleep Aid Flag | 0 | 0 | 17 | 20.88 | 21 | 25 |
EVA Today | 0 | 0 | 17 | 22.12 | 22 | 25 |
Decongestant Flag | 0 | 0 | 18 | 23.09 | 23 | 25 |
Antihistamine Flag | 0 | 0 | 19 | 23.86 | 24 | 25 |
# Plot the first 50 simulations to see how these rankings change
# Between sims
readRDS(file.path(sel.path, "varselect_forest_lz.RDS")) %>%
filter(Sim <= 50) %>%
rank_fun_plot
We fit the ensemble model on the entire data (`rst.smooth`). To do so, we fit a linear mixed effects (LME) model, a functional concurrent regression (FCR) model, and a random forest model on the full data, using either the full set of covariates (“full”) or the top most important variables (“top10”), and either the LRM-50 outcome or the standardized LRM-50 outcome. The ensemble prediction was then the simple average of the predictions from each of these models.
# RST Data with smoothed environmental variables
rst.smooth <- readRDS(file.path(dir.path, "rst_smooth_hourly.RDS"))
# Top 10 + env variables for the model with LRM-50 outcome
t10_l = c("LRM_50_L1", "Age_at_Dock", "Noise_dBA_CS",
"Pre_PVT", "Self_PC_1", "Temp_CS",
"Caffeine", "Radiation_Dose_CS", "Very_stressed",
"Total_sleep_missed",
"ppO2_CS", "CO2_CS")
# Top 10 + env variables for the model with standardized LRM-50 outcome
t10_lz = c("LRM_Z_L1", "Self_PC_1", "Noise_dBA_CS",
"Total_sleep_hrs", "Age_at_Dock",
"Pre_PVT", "Temp_CS", "ppO2_CS",
"Radiation_Dose_CS", "CO2_CS")
# Fit the models on the full data (so, no train/test split)
# Input:
# covars = vector of covariates' variable names
# outcome variable
# train_dat: training data to fit the model on
# time variable for FCR
# k = number of knots for FCR
# wgt_lrf = weights for the linear mixed effects, random forest, and functional concurrent regression models in the ensemble
# type: character describing the model
# rescale_mse: rescale MSE for standardized LRM-50 outcome?
# Output (list)
fit_full_models <- function(covars = covar_mod, outcome = "LRM_50", train_dat = rst.smooth,
time_var = "inflight_time", k = 10,
# Weights for LME, RF, and FCR
wgt_lrf = c(0.333, 0.333, 0.333),
type = "full", rescale_mse = FALSE){
# Obtain name of lagged outcome variable
outcome_lag = get_lagged(outcome)
outcome_snip = get_snippet(outcome)
if (rescale_mse & outcome == "LRM_Z") outcome_snip <- paste0(outcome_snip, "r")
# Filename
fname = paste0("ensemble_", outcome_snip, "_", type, ".RDS")
message(fname)
# Fit linear mixed effects model with AR1 correlation
form = formula_linear(outcome = outcome, covariates = c(covars, outcome_lag))
ran.form = as.formula("~1|Subject_ID")
fit_lme = do.call(nlme::lme, list(fixed=form, random=ran.form,
data = train_dat,
correlation = corAR1(form = ~ obs_num|Subject_ID)))
pred_train_lme = predict(fit_lme, train_dat)
# Fit random forest
form = formula_linear(outcome = outcome,
covariates = c(covars, outcome_lag))
fit_forest = randomForest(form,
data = train_dat)
pred_train_for = predict(fit_forest, train_dat)
# Fit functional concurrent regression
form = formula_fcr(outcome = outcome, covariates = c(covars, outcome_lag),
time_var = time_var, K = k)
fit_fcr = do.call(fcr, list(formula = form, argvals = time_var,
subj = "subj", data = as.data.frame(train_dat),
use_bam = TRUE, nPhi = 1))
pred_train_fcr = predict(fit_fcr, newdata = train_dat,
se.fit = TRUE)$insample_predictions$fit
# Ensemble prediction: combine the 3 model predictions using wgt_lrf
pred_train_ens = data.frame(pred_train_lme = pred_train_lme,
pred_train_for = pred_train_for,
pred_train_fcr = pred_train_fcr) %>%
as.matrix %>%
matrixStats::rowWeightedMeans(., w = wgt_lrf)
# Full data with the predicted outcome
rst_preds = train_dat %>%
mutate(lme = pred_train_lme,
forest = pred_train_for,
fcr = pred_train_fcr,
ensemble = pred_train_ens)
# Prediction Accuracy diagnostics
sd_train = sd(train_dat$LRM_50, na.rm = TRUE)
pred_train_mse = rst_preds %>%
select(Subject_ID, lme, forest, fcr, ensemble, obs = all_of(outcome)) %>%
# Mean Squared Error
summarize(lme = mean((lme - obs)^2, na.rm = TRUE),
forest = mean((forest - obs)^2, na.rm = TRUE),
fcr = mean((fcr - obs)^2, na.rm = TRUE),
ensemble = mean((ensemble - obs)^2, na.rm = TRUE)) %>%
mutate(Variable = "mse_train")
pred_train_rmse = pred_train_mse %>%
# RMSE/sd
mutate(lme = sqrt(lme)/sd_train,
forest = sqrt(forest)/sd_train,
fcr = sqrt(fcr)/sd_train,
ensemble = sqrt(ensemble)/sd_train,
Variable = "rmse_train")
# For z-scored LRM-50 outcome, rescale the MSE to compare to LRM-50 outcome
if (outcome == "LRM_Z" & rescale_mse){
sd_train = sd(train_dat$LRM_50, na.rm = TRUE)
pred_train_mse = rst_preds %>%
select(Subject_ID, lme, forest, fcr, ensemble, obs = all_of(outcome), sd = LRM_sd) %>%
# Mean Squared Error
summarize(lme = mean((sd^2)*(lme - obs)^2, na.rm = TRUE),
forest = mean((sd^2)*(forest - obs)^2, na.rm = TRUE),
fcr = mean((sd^2)*(fcr - obs)^2, na.rm = TRUE),
ensemble = mean((sd^2)*(ensemble - obs)^2, na.rm = TRUE)) %>%
mutate(Variable = "mse_train")
pred_train_rmse = pred_train_mse %>%
# RMSE/sd
mutate(lme = sqrt(lme)/sd_train,
forest = sqrt(forest)/sd_train,
fcr = sqrt(fcr)/sd_train,
ensemble = sqrt(ensemble)/sd_train,
Variable = "rmse_train")
}
# Output
out = list(
# Model Objects
obj_lme = fit_lme,
obj_forest = fit_forest,
obj_fcr = fit_fcr,
# Model predictions + input data
rst_preds = rst_preds,
# MSE and RMSE Diagnostics of ensemble model
mse_train = pred_train_mse$ensemble,
rmse_train = pred_train_rmse$ensemble)
# Save to filename
saveRDS(out, file.path(mod.path.final, fname))
}
# LRM-50 Outcome
fit_full_models(covars = covar_mod, outcome = "LRM_50",
train_dat = rst.smooth, type = "full")
fit_full_models(covars = t10_l, outcome = "LRM_50",
train_dat = rst.smooth, type = "top10")
# LRM-50 (Z-scored) Outcome
fit_full_models(covars = covar_mod, outcome = "LRM_Z",
train_dat = rst.smooth, type = "full")
fit_full_models(covars = t10_lz, outcome = "LRM_Z",
train_dat = rst.smooth, type = "top10")
# LRM-50 (Z-scored) Outcome - Rescale MSE
fit_full_models(covars = covar_mod, outcome = "LRM_Z",
train_dat = rst.smooth, type = "full", rescale_mse = TRUE)
fit_full_models(covars = t10_lz, outcome = "LRM_Z",
train_dat = rst.smooth, type = "top10", rescale_mse = TRUE)
Using the fitted functional regression model, we generate heat maps to visualize the relationship between time, environmental variables, and predicted LRM-50.
# To display the contour map, fit FCR object on the given time grid
time_grid = seq(from = 0, to = 1, length.out = 50)
# LRM-50 Outcome
fcr_grid_obj(preds = "full", covars = covar_mod, outcome = "LRM_50")
fcr_grid_obj(preds = "top10", covars = t10_l, outcome = "LRM_50")
# Standardized LRM-50 Outcome
fcr_grid_obj(preds = "full", covars = covar_mod, outcome = "LRM_Z")
fcr_grid_obj(preds = "top10", covars = t10_lz, outcome = "LRM_Z")
fcr_obj <- readRDS(file.path(mod.path.final, "fcr_grid_full_l.RDS"))
fcr_heatmap_1var(dat0 = rst.smooth,
fcr_obj, outcome = "LRM_50",
covars = covar_mod,
var1 = "Radiation_Dose_CS",
lrm_lims = c(-35.9, -28))
fcr_heatmap_1var(dat0 = rst.smooth,
fcr_obj, outcome = "LRM_50",
covars = covar_mod,
var1 = "Temp_CS",
lrm_lims = c(-35.9, -28))
fcr_heatmap_1var(dat0 = rst.smooth,
fcr_obj, outcome = "LRM_50",
covars = covar_mod,
var1 = "Noise_dBA_CS",
lrm_lims = c(-35.9, -28))
fcr_heatmap_1var(dat0 = rst.smooth,
fcr_obj, outcome = "LRM_50",
covars = covar_mod,
var1 = "CO2_CS",
lrm_lims = c(-35.9, -28))
fcr_heatmap_1var(dat0 = rst.smooth,
fcr_obj, outcome = "LRM_50",
covars = covar_mod,
var1 = "ppO2_CS",
lrm_lims = c(-35.9, -28))
We used the following validation schema to determine which model (linear mixed effects, functional concurrent regression, random forest, or ensemble) performed best under repeated subsampling of the data. For each of the 4 models, prediction performance was primarily measured by mean squared error (MSE) and was assessed as follows.

For a participant \(i\), number of training days \(t\), and run number \(k\): the training set consists of all observations from the other participants plus observations \(k\) through \(k + t - 1\) from participant \(i\); the test set consists of participant \(i\)'s next observation (1 testing day); each model is fit on the training set and the MSE is computed on both sets. For example, with \(t = 5\) and \(k = 3\), participant \(i\) contributes observations 3-7 to the training set and observation 8 to the test set.

Repeat by looping over all participants, over all \(t \in \{5, 10, 15, \ldots, 50\}\), and over \(k\) ranging from 1 to 20. To calculate the MSE in the training/test sets at a given value of \(t\), we take the average over all runs within a participant, and then the average over all participants (so that participants with fewer data points are weighted equally). This yields one value of train/test MSE for each value of \(t\).
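Concretely, if \(\mathrm{MSE}_{i,k}(t)\) denotes a model's test-set MSE for participant \(i\), run \(k\), and \(t\) training days, the reported test MSE at \(t\) is

\[ \overline{\mathrm{MSE}}(t) = \frac{1}{N}\sum_{i=1}^{N} \frac{1}{K_i}\sum_{k=1}^{K_i} \mathrm{MSE}_{i,k}(t), \]

where \(N\) is the number of participants and \(K_i \le 20\) is the number of runs available for participant \(i\).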
pred_ensemble <- function(covar_type = "full", outcome = "LRM_50",
train_dat, test_dat,
time_var = "inflight_time", k = 10,
# Weights for LME, RF, and FCR
wgt_lrf = c(0.333, 0.333, 0.333),
rescale_mse = FALSE){
# Set of covariates
covars <- if(covar_type == "full") {
covar_mod
} else {
if (outcome == "LRM_50") t10_l else t10_z
}
# Lagged outcome variable
outcome_lag = get_lagged(outcome)
# Predict LME
form = formula_linear(outcome = outcome, covariates = c(covars, outcome_lag))
ran.form = as.formula("~1|Subject_ID")
fit = do.call(nlme::lme, list(fixed=form, random=ran.form,
data = train_dat,
correlation = corAR1(form = ~ obs_num|Subject_ID)))
pred_test_lme = predict(fit, test_dat)
pred_train_lme = predict(fit, train_dat)
# Predict Random Forest
form = formula_linear(outcome = outcome,
covariates = c(covars, outcome_lag))
fit = randomForest(form,
data = train_dat)
pred_test_for = predict(fit, test_dat)
pred_train_for = predict(fit, train_dat)
# Predict Functional Concurrent Regression
form = formula_fcr(outcome = outcome, covariates = c(covars, outcome_lag),
time_var = time_var, K = k)
fit = do.call(fcr, list(formula = form, argvals = time_var,
subj = "subj", data = as.data.frame(train_dat),
use_bam = TRUE, argvals.new = test_dat[,time_var], nPhi = 1))
pred_test_fcr = predict(fit, newdata = test_dat, se.fit = TRUE)$insample_predictions$fit
pred_train_fcr = predict(fit, newdata = train_dat, se.fit = TRUE)$insample_predictions$fit
############ ENSEMBLE DATA ###############
# Ensemble prediction: combine the 3 model predictions using wgt_lrf
pred_test_ens = data.frame(pred_test_lme = pred_test_lme,
pred_test_for = pred_test_for,
pred_test_fcr = pred_test_fcr) %>%
as.matrix %>%
matrixStats::rowWeightedMeans(., w = wgt_lrf)
pred_train_ens = data.frame(pred_train_lme = pred_train_lme,
pred_train_for = pred_train_for,
pred_train_fcr = pred_train_fcr) %>%
as.matrix %>%
matrixStats::rowWeightedMeans(., w = wgt_lrf)
######### MODEL PERFORMANCE #########
# Full data with the predicted outcome
rst_preds_train = train_dat %>%
mutate(lme = pred_train_lme,
forest = pred_train_for,
fcr = pred_train_fcr,
ensemble = pred_train_ens)
rst_preds_test = test_dat %>%
mutate(lme = pred_test_lme,
forest = pred_test_for,
fcr = pred_test_fcr,
ensemble = pred_test_ens)
# Rescale MSE?
if (outcome == "LRM_Z" & rescale_mse){
sd_test = sd(test_dat$LRM_50, na.rm = TRUE)
sd_train = sd(train_dat$LRM_50, na.rm = TRUE)
# If there is only one test observation, use the training-set sd instead
if (nrow(test_dat) == 1){
sd_test = sd(train_dat$LRM_50, na.rm = TRUE)
}
# Train
pred_train_mse = rst_preds_train %>%
select(Subject_ID, lme, forest, fcr, ensemble, obs = all_of(outcome), sd = LRM_sd) %>%
# Mean Squared Error
summarize(lme = mean((sd^2)*(lme - obs)^2, na.rm = TRUE),
forest = mean((sd^2)*(forest - obs)^2, na.rm = TRUE),
fcr = mean((sd^2)*(fcr - obs)^2, na.rm = TRUE),
ensemble = mean((sd^2)*(ensemble - obs)^2, na.rm = TRUE)) %>%
mutate(Variable = "mse_train")
# Test
pred_test_mse = rst_preds_test %>%
select(Subject_ID, lme, forest, fcr, ensemble, obs = all_of(outcome), sd = LRM_sd) %>%
# Mean Squared Error
summarize(lme = mean((sd^2)*(lme - obs)^2, na.rm = TRUE),
forest = mean((sd^2)*(forest - obs)^2, na.rm = TRUE),
fcr = mean((sd^2)*(fcr - obs)^2, na.rm = TRUE),
ensemble = mean((sd^2)*(ensemble - obs)^2, na.rm = TRUE)) %>%
mutate(Variable = "mse_test")
} else {
# Calculate MSE - Test
sd_test = sd(test_dat[,outcome], na.rm = TRUE)
sd_train = sd(train_dat[,outcome], na.rm = TRUE)
# If there is only one test observation, use the training-set sd instead
if (nrow(test_dat) == 1){
sd_test = sd_train
}
# Calculate MSE - Train
pred_train_mse = rst_preds_train %>%
select(Subject_ID, lme, forest, fcr, ensemble, obs = all_of(outcome)) %>%
# Mean Squared Error
summarize(lme = mean((lme - obs)^2, na.rm = TRUE),
forest = mean((forest - obs)^2, na.rm = TRUE),
fcr = mean((fcr - obs)^2, na.rm = TRUE),
ensemble = mean((ensemble - obs)^2, na.rm = TRUE)) %>%
mutate(Variable = "mse_train")
pred_test_mse = rst_preds_test %>%
select(Subject_ID, lme, forest, fcr, ensemble, obs = all_of(outcome)) %>%
# Mean Squared Error
summarize(lme = mean((lme - obs)^2, na.rm = TRUE),
forest = mean((forest - obs)^2, na.rm = TRUE),
fcr = mean((fcr - obs)^2, na.rm = TRUE),
ensemble = mean((ensemble - obs)^2, na.rm = TRUE)) %>%
mutate(Variable = "mse_test")
}
# RMSE/sd for the training and test sets (mirrors fit_full_models)
pred_train_rmse = pred_train_mse %>%
mutate(lme = sqrt(lme)/sd_train,
forest = sqrt(forest)/sd_train,
fcr = sqrt(fcr)/sd_train,
ensemble = sqrt(ensemble)/sd_train,
Variable = "rmse_train")
pred_test_rmse = pred_test_mse %>%
mutate(lme = sqrt(lme)/sd_test,
forest = sqrt(forest)/sd_test,
fcr = sqrt(fcr)/sd_test,
ensemble = sqrt(ensemble)/sd_test,
Variable = "rmse_test")
# Return:
# Table of prediction diagnostics
# Full data w/ predicted outcome
mse_table = rbind(pred_test_mse, pred_test_rmse,
pred_train_mse, pred_train_rmse)
out = list(mse_table = mse_table,
# Return data with predictions
rst_preds_train = rst_preds_train,
rst_preds_test = rst_preds_test)
return(out)
}
# Append the participant ID to pred_ensemble output
pred_ensemble_oos <- function(the_subj = 1001, ...){
out = pred_ensemble(...)$`mse_table` %>%
mutate(Subject_ID = the_subj)
return(out)
}
# For a given participant (the_subj), split data (dat) as described above
sample_oos <- function(dat, the_subj = 1001, run_num = 1,
training_days, testing_days = 1){
# All other participants = training set
dat_others <- dat %>%
filter(Subject_ID != the_subj)
# For this participant,
dat_subj <- dat %>% filter(Subject_ID == the_subj)
# Throw error if not enough data
if (training_days + testing_days > nrow(dat_subj)){
stop("Not enough data to split into the training/test days specified.")
}
inds_train = seq(from = run_num,
to = run_num + training_days - 1)
inds_test = seq(from = run_num + training_days,
to = run_num + training_days + testing_days - 1)
# Throw error if not enough data
if (!all(inds_train < nrow(dat_subj))){
stop("Not enough training data")
}
if (!all(inds_test < nrow(dat_subj))){
stop("Not enough testing data")
}
dat_train <- dat_others %>%
bind_rows(dat_subj %>% slice(inds_train)) %>%
arrange(Subject_ID, inflight_time)
dat_test <- dat_subj %>%
slice(inds_test)
return(list(Train = dat_train, Test = dat_test,
# Actual number of rows in train/test for this participant
train_n = length(inds_train),
test_n = length(inds_test)))
}
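# Hypothetical usage of sample_oos (the ID is illustrative): with 5 training
# days and run number 3, participant "1001" contributes observations 3-7 to
# the training set and observation 8 to the test set:
# sp <- sample_oos(dat = rst.smooth, the_subj = "1001", run_num = 3, training_days = 5)
# sp$train_n # 5 (this participant's rows; sp$Train also holds all other participants)
# sp$test_n  # 1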
# For a dataset (dat0) and participant (the_subj),
# Split the data based on the number of training/test days (training_days/testing_days)
# Using either full, or top 10 covariates (covar_type)
# and a given outcome variable
divide_data_oos <- function(dat0 = rst.smooth, the_subj = 1001, training_days = 30,
testing_days = 1, covar_type = "full",
outcome = "LRM_50", mod_path = day1.path,
rescale_mse = FALSE){
stopifnot(training_days >= 5)
stopifnot(covar_type %in% c("full", "top10"))
# For each participant, sort the data by time
dat <- dat0 %>%
arrange(Subject_ID, inflight_time)
N_subj = dat %>% filter(Subject_ID == the_subj) %>%
nrow()
run_seq = seq(from = 1,
to = min(20, N_subj - testing_days - training_days))
for (i in run_seq){
# Split data into training and test sets
rst.split <- sample_oos(dat = dat, the_subj = the_subj, run_num = i,
training_days = training_days, testing_days = testing_days)
rst.train <- rst.split[["Train"]]
rst.test <- rst.split[["Test"]]
# Number of participants
n_train = rst.split[["train_n"]]
n_test = rst.split[["test_n"]]
# Outcome
out_snip = get_snippet(outcome)
if (outcome == "LRM_Z" & rescale_mse) out_snip = paste0(out_snip, "r")
# Select/create model directory
mod_save_dir <- file.path(mod_path, paste0("OOS_", out_snip, "_", covar_type))
if (!dir.exists(mod_save_dir)) dir.create(mod_save_dir)
# Run ensemble model
fname_out = paste0("r", i, "_train", training_days, "_test",
testing_days, "_" ,the_subj, ".RDS")
message(paste0("Saving ", file.path(paste0("OOS_", out_snip, "_", covar_type),
fname_out)))
# Predict ensemble
pred_all = pred_ensemble_oos(the_subj = the_subj,
covar_type = covar_type, outcome = outcome,
train_dat = rst.train, test_dat = rst.test,
rescale_mse = rescale_mse) %>%
mutate(Run = i,
Training_Days = training_days,
Train_actual = n_train,
Testing_days = testing_days,
Test_actual = n_test)
# Save this file
saveRDS(pred_all, file.path(mod_save_dir, fname_out))
}
}
# Run the code
# Out-of-sample predictions
train_seq = seq(from = 5, to = 50, by = 5)
all_subs = unique(rst.smooth$Subject_ID) %>% as.character
for (i in train_seq){ # Over all numbers of training days
# Full set of covariates
parallel::mclapply(X = all_subs,
FUN = divide_data_oos,
dat0 = rst.smooth,
training_days = i, testing_days = 1,
covar_type = "full", outcome = "LRM_50",
mod_path = day1.path,
mc.cores = n_cores)
# Top 10 covariates
parallel::mclapply(X = all_subs,
FUN = divide_data_oos,
dat0 = rst.smooth,
training_days = i, testing_days = 1,
model_type = "top10", outcome = "LRM_50",
mod_path = day1.path,
mc.cores = n_cores)
}
# Summarize data for plots
# list files and process
list.files(file.path(day1.path, "OOS_l_full"),
full.names = TRUE, pattern = ".RDS") %>%
map(readRDS) %>%
Reduce(rbind,.) %>%
# Take average over all participants, variables, and Training Days
group_by(Variable, Training_Days) %>%
summarize_at(c("lme", "forest", "fcr", "ensemble"),
mean, na.rm = TRUE) %>%
gather(Model, Value, -Variable, -Training_Days) %>%
mutate(Model0 = Model,
Model = recode_factor(Model0,
"fcr"= "Functional Concurrent Regression",
"forest" = "Random Forest",
"lme" = "Linear Mixed Effects",
"ensemble" = "Ensemble")) %>%
saveRDS(file.path(day1.path, "performance_OOS_l_full.RDS"))
list.files(file.path(day1.path, "OOS_l_top10"),
full.names = TRUE, pattern = ".RDS") %>%
map(readRDS) %>%
Reduce(rbind,.) %>%
# Take average over all participants, variables, and Training Days
group_by(Variable, Training_Days) %>%
summarize_at(c("lme", "forest", "fcr", "ensemble"),
mean, na.rm = TRUE) %>%
gather(Model, Value, -Variable, -Training_Days) %>%
mutate(Model0 = Model,
Model = recode_factor(Model0,
"fcr"= "Functional Concurrent Regression",
"forest" = "Random Forest",
"lme" = "Linear Mixed Effects",
"ensemble" = "Ensemble")) %>%
saveRDS(file.path(day1.path, "performance_OOS_l_top10.RDS"))
list.files(file.path(day1.path, "OOS_lzr_full"),
full.names = TRUE, pattern = ".RDS") %>%
map(readRDS) %>%
Reduce(rbind,.) %>%
# Take average over all participants, variables, and Training Days
group_by(Variable, Training_Days) %>%
summarize_at(c("lme", "forest", "fcr", "ensemble"),
mean, na.rm = TRUE) %>%
gather(Model, Value, -Variable, -Training_Days) %>%
mutate(Model0 = Model,
Model = recode_factor(Model0,
"fcr"= "Functional Concurrent Regression",
"forest" = "Random Forest",
"lme" = "Linear Mixed Effects",
"ensemble" = "Ensemble")) %>%
saveRDS(file.path(day1.path, "performance_OOS_lzr_full.RDS"))
list.files(file.path(day1.path, "OOS_lzr_top10"),
full.names = TRUE, pattern = ".RDS") %>%
map(readRDS) %>%
Reduce(rbind,.) %>%
# Take average over all participants, variables, and Training Days
group_by(Variable, Training_Days) %>%
summarize_at(c("lme", "forest", "fcr", "ensemble"),
mean, na.rm = TRUE) %>%
gather(Model, Value, -Variable, -Training_Days) %>%
mutate(Model0 = Model,
Model = recode_factor(Model0,
"fcr"= "Functional Concurrent Regression",
"forest" = "Random Forest",
"lme" = "Linear Mixed Effects",
"ensemble" = "Ensemble")) %>%
saveRDS(file.path(day1.path, "performance_OOS_lzr_top10.RDS"))
# Plot prediction performance
plot_fun_oos <- function(dir_dat = day1.path, covar_type = "full", outcome = "LRM_50"){
out_snip = get_snippet(outcome)
fname <- file.path(dir_dat,
paste0("performance_OOS_", out_snip, "_", covar_type, ".RDS"))
all_dat <- readRDS(fname)
vars = c("lme", "forest", "fcr", "ensemble")
perf_dat_mse <- all_dat %>%
filter(Model0 %in% vars, Variable %in% c("mse_train", "mse_test")) %>%
select(Model, Training_Days, Variable, Value) %>%
mutate(Variable = recode(Variable,
"mse_train" = "MSE (Train)",
"mse_test" = "MSE (Test)"),
Variable = factor(Variable,
levels = c("MSE (Train)", "MSE (Test)")))
perf_dat_rmse <- all_dat %>%
filter(Model0 %in% vars, Variable %in% c("rmse_train", "rmse_test")) %>%
select(Model, Training_Days, Variable, Value) %>%
mutate(Variable = recode(Variable,
"rmse_train" = "RMSE/sd (Train)",
"rmse_test" = "RMSE/sd (Test)"),
Variable = factor(Variable,
levels = c("RMSE/sd (Train)", "RMSE/sd (Test)")))
perf_dat <- bind_rows(perf_dat_mse, perf_dat_rmse)
# Format title
# string1 <- "Out-of-sample"
string1 = paste0(get_label(outcome), " Outcome, ")
string2 <- switch(covar_type,
"full" = "Full Covariates",
"top10" = "Top 10 Covariates",
"top10nn" = "Top 10 (No Noise) Covariates",
"topany" = "Top 10 (+ Env) Covariates",
"topanynn" = "Top 10 (+ Env - Noise) Covariates")
string3 = paste0(string1, string2)
# Plot
p_perf_mse = ggplot(perf_dat_mse) +
geom_point(aes(x = Value, y = Training_Days,
color = Model, shape = Model)) +
scale_color_manual(breaks = c("Functional Concurrent Regression", "Random Forest", "Linear Mixed Effects", "Ensemble"),
values = c("#117733", "#CC6677", "#88CCEE", "#332288")) +
scale_shape_manual(breaks = c("Functional Concurrent Regression", "Random Forest", "Linear Mixed Effects", "Ensemble"),
values = c(2, 4, 8, 16)) +
facet_wrap(~Variable, scales = "fixed") +
theme_minimal() +
theme(legend.position = "none",
text = element_text(size = 10)) +
labs(x = "", y = "", title = string3)
p_perf_rmse = ggplot(perf_dat_rmse) +
geom_point(aes(x = Value, y = Training_Days,
color = Model, shape = Model)) +
scale_color_manual(breaks = c("Functional Concurrent Regression", "Random Forest", "Linear Mixed Effects", "Ensemble"),
values = c("#117733", "#CC6677", "#88CCEE", "#332288")) +
scale_shape_manual(breaks = c("Functional Concurrent Regression", "Random Forest", "Linear Mixed Effects", "Ensemble"),
values = c(2, 4, 8, 16)) +
facet_wrap(~Variable, scales = "fixed") +
theme_minimal() +
theme(legend.position = "none",
text = element_text(size = 10)) +
labs(x = "Prediction Error", y = "Days in Training Set", title = string3)
p_perf = arrangeGrob(p_perf_mse, p_perf_rmse, ncol = 1)
# Table: same data as plot
tab_perf = perf_dat %>%
pivot_wider(names_from = "Model", values_from = "Value") %>%
group_by(Variable) %>%
summarize_at(vars(`Linear Mixed Effects`:`Ensemble`), mean)
# Table 2: same data as plot, but averaged over t
tab_perf_avg = perf_dat %>%
pivot_wider(names_from = "Model", values_from = "Value") %>%
filter(Variable %in% c("MSE (Train)","MSE (Test)")) %>%
group_by(Variable) %>%
summarize_at(vars(`Linear Mixed Effects`:`Ensemble`), mean)
return(list(p_perf, tab_perf, tab_perf_avg))
}
rank_table_fun_oos <- function(dir_dat = day1.path, covar_type = "full", outcome = "LRM_50"){
out_snip = get_snippet(outcome)
fname <- file.path(dir_dat,
paste0("performance_OOS_", out_snip, "_", covar_type, ".RDS"))
all_dat <- readRDS(fname)
vars = c("lme", "forest", "fcr", "ensemble")
perf_dat_mse <- all_dat %>%
filter(Model0 %in% vars, Variable %in% c("mse_train", "mse_test")) %>%
select(Model, Training_Days, Variable, Value)
perf_dat_rmse <- all_dat %>%
filter(Model0 %in% vars, Variable %in% c("rmse_train", "rmse_test")) %>%
select(Model, Training_Days, Variable, Value)
bind_rows(perf_dat_mse, perf_dat_rmse) %>%
# For each value of Training_Days, rank the models
group_by(Training_Days, Variable) %>%
mutate(Rank = rank(Value)) %>%
group_by(Model, Variable) %>%
summarize(Mean_Rank = mean(Rank)) %>%
arrange(Variable, Mean_Rank) %>%
filter(Variable %in% c("mse_test")) %>%
pandoc.table(split.table = Inf)
}
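rank_table_fun_oos() ranks the four models within each training-set size (1 = lowest error) and then averages those ranks per model, so a lower mean rank is better. A toy illustration of the mean-rank computation, with hypothetical models and values:
library(dplyr)
tibble(Training_Days = rep(c(10, 20), each = 2),
Model = rep(c("A", "B"), times = 2),
Value = c(1.2, 0.9, 1.1, 1.3)) %>%
group_by(Training_Days) %>%
mutate(Rank = rank(Value)) %>% # Rank models within each training-set size
group_by(Model) %>%
summarize(Mean_Rank = mean(Rank))
# Model A gets ranks 2 and 1 (mean 1.5); Model B gets ranks 1 and 2 (mean 1.5)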
# Performance, by t
plot_fun_oos(covar_type = "full", outcome = "LRM_50")[[1]] %>% grid.draw()# Performance averaged over t
plot_fun_oos(covar_type = "full", outcome = "LRM_50")[[3]] %>% pt()
Variable | Linear Mixed Effects | Random Forest | Functional Concurrent Regression | Ensemble |
---|---|---|---|---|
MSE (Train) | 45.54 | 8.574 | 41.9 | 27.77 |
MSE (Test) | 48.57 | 51.46 | 48.48 | 46.45 |
# Rank table
rank_table_fun_oos(covar_type = "full", outcome = "LRM_50")
Model | Variable | Mean_Rank |
---|---|---|
Ensemble | mse_test | 1.2 |
Linear Mixed Effects | mse_test | 2.6 |
Functional Concurrent Regression | mse_test | 2.8 |
Random Forest | mse_test | 3.4 |
# Performance, by t
plot_fun_oos(covar_type = "top10", outcome = "LRM_50")[[1]] %>% grid.draw()# Performance averaged over t
plot_fun_oos(covar_type = "top10", outcome = "LRM_50")[[3]] %>% pt()
Variable | Linear Mixed Effects | Random Forest | Functional Concurrent Regression | Ensemble |
---|---|---|---|---|
MSE (Train) | 46.64 | 8.855 | 43.89 | 28.74 |
MSE (Test) | 49.27 | 51.37 | 49.3 | 47.06 |
# Rank table
rank_table_fun_oos(covar_type = "top10", outcome = "LRM_50")
Model | Variable | Mean_Rank |
---|---|---|
Ensemble | mse_test | 1.2 |
Functional Concurrent Regression | mse_test | 2.8 |
Linear Mixed Effects | mse_test | 2.8 |
Random Forest | mse_test | 3.2 |
Overall, the ensemble model displayed the smallest MSE in the test set across all simulations and participants. As expected, the random forest had the smallest MSE in the training set, but it appears to overfit: it performed worse than the ensemble model on the unseen test data.
# Performance, by t
plot_fun_oos(covar_type = "full", outcome = "LRM_Z_R")[[1]] %>% grid.draw()# Performance averaged over t
plot_fun_oos(covar_type = "full", outcome = "LRM_Z_R")[[3]] %>% pt()
Variable | Linear Mixed Effects | Random Forest | Functional Concurrent Regression | Ensemble |
---|---|---|---|---|
MSE (Train) | 45.54 | 8.576 | 41.9 | 27.77 |
MSE (Test) | 48.57 | 51.62 | 48.48 | 46.5 |
# Rank table
rank_table_fun_oos(covar_type = "full", outcome = "LRM_Z_R")
Model | Variable | Mean_Rank |
---|---|---|
Ensemble | mse_test | 1.2 |
Linear Mixed Effects | mse_test | 2.6 |
Functional Concurrent Regression | mse_test | 2.8 |
Random Forest | mse_test | 3.4 |
# Performance, by t
plot_fun_oos(covar_type = "top10", outcome = "LRM_Z_R")[[1]] %>% grid.draw()# Performance averaged over t
plot_fun_oos(covar_type = "top10", outcome = "LRM_Z_R")[[3]] %>% pt()
Variable | Linear Mixed Effects | Random Forest | Functional Concurrent Regression | Ensemble |
---|---|---|---|---|
MSE (Train) | 46.64 | 8.855 | 43.89 | 28.74 |
MSE (Test) | 49.27 | 51.24 | 49.3 | 47.01 |
# Rank table
rank_table_fun_oos(covar_type = "top10", outcome = "LRM_Z_R")
Model | Variable | Mean_Rank |
---|---|---|
Ensemble | mse_test | 1.2 |
Functional Concurrent Regression | mse_test | 2.7 |
Linear Mixed Effects | mse_test | 2.8 |
Random Forest | mse_test | 3.3 |
This script was run to process the environmental data from its raw format, performing the spatial/temporal aggregation needed to integrate it with the RST data. For more details, see Section 2.4.
# Set these to the location of the noise/temp/radiation/CO2 data
noise.dir <- file.path(box_dir, "Acoustics Data/")
temp.dir <- file.path(box_dir, "Temperature and ppO2 data/")
rad.dir <- file.path(box_dir, "Radiation data/")
co2.dir <- file.path(box_dir, "CO2 Data/")
# aggr_loc() takes a data set dat with one or more Time_* columns,
# the name of the US Lab variable, and the name of the Node 2 variable.
# It renames the variables, tidies up, and adds a weighted-average
# column (Avg) across the two locations.
aggr_loc <- function(dat, lab_var, node_var){
# Quote the bare column names for tidy evaluation
lab_var2 = enquo(lab_var)
node_var2 = enquo(node_var)
# Weighted mean of the US Lab and Node 2 columns
tmp = dat %>%
ungroup %>%
select(!!lab_var2, !!node_var2)
w_avg = tmp %>%
data.matrix %>% # Coerce to matrix
rowWeightedMeans(w = c(0.25, 0.75), na.rm = TRUE) %>% # matrixStats::rowWeightedMeans()
replace_nan_with_na() # Helper from helpers.R (recodes NaN to NA)
new_colnames = c(colnames(dat %>% select(contains("Time"))),
"Node 2", "US Lab", "Avg")
dat %>%
ungroup %>%
select(contains("Time_"), !!node_var2, !!lab_var2) %>%
mutate(Avg = w_avg) %>%
`colnames<-`(new_colnames)
}
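A minimal usage sketch of aggr_loc() on toy hourly data (all values hypothetical). Note that aggr_loc() relies on matrixStats::rowWeightedMeans() and on replace_nan_with_na() from helpers.R; a one-line stand-in for the latter is defined here so the sketch runs on its own:
library(dplyr)
library(matrixStats)
# Stand-in for the helpers.R function of the same name
replace_nan_with_na <- function(x) ifelse(is.nan(x), NA, x)
toy <- tibble(Time_hour = as.POSIXct("2015-06-01 00:00:00", tz = "UTC") + 3600 * 0:2,
`US Lab` = c(22.1, 22.4, NA),
`Node 2` = c(21.8, 22.0, 22.3))
aggr_loc(toy, lab_var = `US Lab`, node_var = `Node 2`)
# Returns Time_hour, Node 2, US Lab, and the weighted Avg column;
# where one location is missing, Avg falls back to the other (na.rm = TRUE)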
The temperature and ppO2 data were combined in the same Excel files.
# temp_hr_loc.rds
# Read in all of the temperature files EXCEPT the 2009 one (which is formatted differently)
temp_files <- list.files(temp.dir, full.names = TRUE) %>%
# Drop the 2009 file (read in separately below)
setdiff(., .[grepl("2009", .)]) %>%
# Drop duplicate "copy" files
setdiff(., .[grepl("copy", .)])
# 2009 file
fname_09 <- list.files(temp.dir, full.names = TRUE,
pattern = "2009 275 - 2010 077 Temp and O2 Combined converted.xlsx")
# Read the 2009 file in
dat09 <- read_excel(fname_09,
col_types = c("date", "numeric", "numeric",
"date", rep("numeric", 5))) %>%
slice(3:nrow(.)) %>% # Skip first couple lines as they are empty
select(-3) %>% # Get rid of the 3rd column as it's all missing
# Rename the `Lab MCA` column to LAER12FC0972P
rename(LAER12FC0972P = `Lab MCA`,
# Rename the two date columns Time1 and Time2
Time1 = `...1`,
Time2 = `...4`) %>%
# Categorize temp sensors by location: US Lab, Node 2
select(Timestamp = Time1,
LAET23SR0201T,
LAET27SR0201T,
N2ET21SR0201T) %>%
gather(Sensor, Temp, -Timestamp) %>%
mutate(Location = recode(Sensor,
"LAET23SR0201T" = "US Lab",
"LAET27SR0201T"= "US Lab",
"N2ET21SR0201T" = "Node 2",
.default = "Other")) %>%
group_by(Timestamp, Location) %>%
summarize(Temp = mean(Temp, na.rm = TRUE)) %>%
filter(!is.na(Temp))
# Read and clean the data
data_temp <- function(fname){
print(fname)
dat_ncol <- read_excel(fname, n_max = 3) %>% ncol()
dat <- read_excel(fname,
col_types = c("date", rep("numeric",
times = dat_ncol - 1))) %>%
slice(3:nrow(.)) %>%
rename(Timestamp = `...1`) %>%
# Categorize temp sensors by location: US Lab, Node 2, Other
select(Timestamp,
LAET23SR0201T,
LAET27SR0201T,
N2ET21SR0201T) %>%
gather(Sensor, Temp, -Timestamp) %>%
mutate(Location = recode(Sensor,
"LAET23SR0201T" = "US Lab",
"LAET27SR0201T"= "US Lab",
"N2ET21SR0201T" = "Node 2",
.default = "Other")) %>%
group_by(Timestamp, Location) %>%
summarize(Temp = mean(Temp, na.rm = TRUE)) %>%
filter(!is.na(Temp))
return(dat)
}
out_temp = temp_files %>%
as.list() %>%
map(data_temp) %>%
reduce(full_join) # Join on all shared columns, keeping the union of rows
# Glue the 2009 and post-2009 data together
dat_temp <- rbind(dat09, out_temp) %>%
# Get the hourly information
mutate(Time0 = fastPOSIXct(Timestamp, tz = "UTC"),
Time_day = fastPOSIXct(trunc(Time0, units = "days"), tz = "UTC"),
Time_hour = fastPOSIXct(trunc(Time0, units = "hours"), tz = "UTC"))
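The trunc() calls keep the calendar day/hour and drop the remainder; for example (hypothetical timestamp):
x <- fastPOSIXct("2015-06-01 13:45:30", tz = "UTC")
fastPOSIXct(trunc(x, units = "days"), tz = "UTC") # "2015-06-01 00:00:00 UTC"
fastPOSIXct(trunc(x, units = "hours"), tz = "UTC") # "2015-06-01 13:00:00 UTC"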
# Aggregated by hour
dat_temp_hr <- dat_temp %>%
group_by(Time_hour, Location) %>%
summarize(Temp = mean(Temp, na.rm = TRUE)) %>%
spread(key = Location, value = Temp) %>%
aggr_loc(lab_var = `US Lab`,
node_var = `Node 2`) %>%
gather(Loc, Temp, -Time_hour) %>%
filter(!is.na(Temp))
saveRDS(dat_temp_hr, file = file.path(out.dir, "temp_hr_loc.rds"))
# ppo2_hr.rds
# Read the 2009 file in
dat09 <- read_excel(fname_09,
col_types = c("date", "numeric", "numeric",
"date", rep("numeric", 5))) %>%
slice(3:nrow(.)) %>% # Skip first couple lines as they are empty
select(-3) %>% # Get rid of the 3rd column as it's all missing
# Rename the `Lab MCA` column to LAER12FC0972P
rename(LAER12FC0972P = `Lab MCA`,
# Rename the two date columns Time1 and Time2
Time1 = `...1`,
Time2 = `...4`) %>%
# Keep the ppO2 sensor column, keyed by the second timestamp
select(Timestamp = Time2,
LAER12FC0972P) %>%
gather(Sensor, ppO2, -Timestamp) %>%
# One row per raw timestamp and sensor
group_by(Timestamp, Sensor) %>%
summarize(ppO2 = mean(ppO2, na.rm = TRUE)) %>%
filter(!is.na(ppO2))
# Read and clean data
data_ppo2 <- function(fname){
print(fname)
dat <- read_excel(fname,
col_types = c("date", rep("numeric", 7))) %>%
slice(3:nrow(.)) %>%
rename(Timestamp = `...1`) %>%
# Keep the two ppO2 sensor columns
select(Timestamp, N3ER12FC0972P, LAER12FC0972P) %>%
gather(Sensor, ppO2, -Timestamp) %>%
group_by(Timestamp, Sensor) %>%
summarize(ppO2 = mean(ppO2, na.rm = TRUE)) %>%
filter(!is.na(ppO2))
return(dat)
}
# Glue all post-2009 data together
out_ppo2 = temp_files %>%
as.list() %>%
map(data_ppo2) %>%
reduce(full_join)
# Glue the 2009 and post-2009 data together
dat_ppo2 <- rbind(dat09, out_ppo2)
# Add the hourly timestamp
ppo2_hr <- dat_ppo2 %>%
mutate(Time_hour = fastPOSIXct(trunc(Timestamp, units = "hours"), tz = "UTC"))
# Hourly average -- combined sensors
ppo2_hr %>%
group_by(Time_hour, Sensor) %>%
summarize(ppO2 = mean(ppO2, na.rm = TRUE)) %>%
ungroup %>%
spread(Sensor, ppO2) %>%
mutate(ppO2 = ifelse(Time_hour >= as.Date("2001-02-17"), LAER12FC0972P, as.numeric(NA)),
ppO2 = ifelse(Time_hour >= as.Date("2010-09-08"), N3ER12FC0972P, ppO2),
ppO2 = ifelse(Time_hour >= as.Date("2011-05-13"),
rowMeans(select(., N3ER12FC0972P , LAER12FC0972P), na.rm = TRUE), ppO2),
ppO2 = ifelse(Time_hour >= as.Date("2012-01-29"), N3ER12FC0972P, ppO2),
ppO2 = delete_reps(ppO2)) %>% # delete_reps(): helper from helpers.R
filter(!is.na(ppO2)) %>%
select(Time_hour, ppO2) %>%
saveRDS(., file = file.path(out.dir, "ppo2_hr.rds"))
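The cascade of ifelse() calls above encodes which sensor to trust in each time period; because each assignment overwrites the previous one, the last matching condition wins. The same logic can be sketched with dplyr::case_when(), where the first matching condition wins, so the periods are listed newest-first (era_ppo2 is a hypothetical name, not part of the pipeline that was run):
library(dplyr)
era_ppo2 <- function(df) {
df %>%
mutate(ppO2 = case_when(
Time_hour >= as.Date("2012-01-29") ~ N3ER12FC0972P,
Time_hour >= as.Date("2011-05-13") ~
rowMeans(cbind(N3ER12FC0972P, LAER12FC0972P), na.rm = TRUE),
Time_hour >= as.Date("2010-09-08") ~ N3ER12FC0972P,
Time_hour >= as.Date("2001-02-17") ~ LAER12FC0972P,
TRUE ~ NA_real_))
}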
# co2_hourly.RDS
co2_files = list.files(co2.dir, pattern = "xlsx", full.names = TRUE)
# Given a sheet dat that has already been read in, keep the block of
# index_len columns starting at column index_start
data_co2_pieces <- function(dat, index_start, index_len = 5, ...){
indices = seq(from = index_start, len = index_len)
dat2 <- dat %>%
select(indices)
cnames_temp <- colnames(dat2)[4:index_len] %>%
substr(start = 1, stop = 2) %>%
recode("No" = "Node3", "Ye" = "Year", "La" = "Lab", "N3" = "Node3")
cnames_out <- c("Date", "Col1", "Col2", cnames_temp)
dat2 %>%
`colnames<-`(cnames_out) %>%
# Carry year forward
mutate(Year = zoo::na.locf(Year)) %>%
mutate_at(vars(-Date), as.numeric)
}
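To see what data_co2_pieces() does, consider a single five-column block (the column names a1/a2/Lab and all values here are hypothetical, arranged to mimic the raw sheet layout):
library(dplyr)
toy_sheet <- tibble(Date = c("032/06:15:00.00", "032/06:16:00.00"),
a1 = c(0.30, 0.31),
a2 = c(0.29, 0.30),
Lab = c(0.30, 0.31),
Year = c(2011, NA))
data_co2_pieces(toy_sheet, index_start = 1, index_len = 5)
# Columns become Date, Col1, Col2, Lab, Year, with Year carried forward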
# For each sheet, try to get all the data and stack it on each other
data_co2 <- function(fname){
dat <- read_excel(fname, sheet = 1)
# Look for the columns that have "Date" in it
# Store these column indices in date_indices
cnames <- colnames(dat)
date_indices <- grep(x = cnames, pattern = "Date")
# Figure out what all of the other columns are
index_len = date_indices[2] - date_indices[1]
# For each Date column, grab its block of index_len columns
dat3 <- date_indices %>%
map(~data_co2_pieces(dat, index_start = ., index_len)) %>%
bind_rows %>%
gather(Sensor, CO2, -Date, -Year) %>%
filter(!is.na(CO2)) %>%
# Date format
mutate(d_days = substr(Date, start = 1, stop = 3),
d_hour = substr(Date, start = 5, stop = 6),
d_minu = substr(Date, start = 8, stop = 9),
d_seco = substr(Date, start = 11, stop = 16)) %>%
mutate_at(vars(d_days, d_hour, d_minu, d_seco), as.numeric) %>%
# Ignore missing values of date - average columns
filter(!is.na(d_days)) %>%
# Compute the full timestamp as Date2 from the day-of-year pieces
mutate(Date2 = fastPOSIXct(paste0(Year, "-01-01 00:00:00"), tz = "GMT") +
days(d_days - 1) + # Don't count first day
hours(d_hour) + minutes(d_minu) + seconds(d_seco))
# Detect and get rid of incomplete days.
dat3_incomplete <- dat3 %>%
group_by(Year, d_days) %>%
summarize(n1 = min(d_hour), n2 = min(d_minu),
n3 = max(d_hour), n4 = max(d_minu)) %>%
# Get days that started in the middle of the day, or
# ended before the day was over
filter(!(n1 == 0 & n2 == 0 & n3 == 23 & n4 == 59)) %>%
select(Year, d_days)
# Aggregate at the day level now
out <- dat3 %>%
# Remove the rows that had incomplete days
anti_join(dat3_incomplete, by = c("Year", "d_days")) %>%
select(Date = Date2, Sensor, CO2) %>%
distinct()
return(out)
}
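As a worked example of the day-of-year conversion above, a hypothetical stamp of day 32 of 2011 at 06:15:00 maps to:
library(fasttime)
library(lubridate)
fastPOSIXct("2011-01-01 00:00:00", tz = "GMT") +
days(32 - 1) + hours(6) + minutes(15) + seconds(0)
# "2011-02-01 06:15:00 GMT" -- day 32 of the year is February 1st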
# Do the same thing as data_co2(), but for the first file (formatted differently)
data_co2a <- function(fname){
dat <- read_excel(fname, sheet = 1)
# Get the Date columns
cnames <- colnames(dat)
date_indices <- grep(x = cnames, pattern = "Date")
index_len = date_indices[2] - date_indices[1]
# For each Date column, grab its block of index_len columns
dat3 <- date_indices %>%
map(~data_co2_pieces(dat, index_start = ., index_len)) %>%
bind_rows %>%
gather(Sensor, CO2, -Date, -Year) %>%
filter(!is.na(CO2)) %>%
select(Date, Year, Sensor, CO2) %>%
mutate(Time_year = year(Date),
Time_month = month(Date),
Time_day = day(Date),
Time_hour = hour(Date))
# Get rid of incomplete days.
dat3_incomplete <- dat3 %>%
mutate(Hour = hour(Date),
Minute = minute(Date),
Second = second(Date)) %>%
group_by(Time_day, Time_month, Time_year) %>%
mutate(n1 = min(Hour), n2 = min(Minute),
n3 = max(Hour), n4 = max(Minute)) %>%
ungroup %>%
filter(!(n1 == 0 & n2 == 0 & n3 == 23 & n4 == 59)) %>%
arrange(Time_year, Time_month, Time_day) %>%
slice(c(1, nrow(.))) %>%
select(Time_year, Time_month, Time_day)
# Aggregate at the day level now
out <- dat3 %>%
anti_join(dat3_incomplete, by = c("Time_year", "Time_month", "Time_day")) %>%
select(Date, Sensor, CO2) %>%
distinct()
return(out)
}
# First file
dat_co2a <- co2_files[1] %>%
as.list() %>%
map(data_co2a) %>%
bind_rows
# All the other files are formatted similarly
dat_co2b <- co2_files[2:18] %>%
as.list() %>%
map(data_co2) %>%
bind_rows
dat_co2_all <- bind_rows(dat_co2a, dat_co2b) %>%
distinct %>%
filter(!is.na(Date)) %>%
mutate(Time_day = fastPOSIXct(trunc(Date, units = "days"), tz = "UTC"),
Time_hour = fastPOSIXct(trunc(Date, units = "hours"), tz = "UTC"))
saveRDS(dat_co2_all, file.path(dir.path, "co2_raw.RDS"))
readRDS(file.path(dir.path, "co2_raw.RDS")) %>%
select(Date, Sensor, CO2, Time_hour) %>%
distinct %>%
# Take hourly average
group_by(Sensor, Time_hour) %>%
summarize(CO2 = mean(CO2, na.rm = TRUE)) %>%
spread(Sensor, CO2) %>%
# Combine Col1 and Col2
mutate(Col = rowMeans(select(., Col1, Col2), na.rm = TRUE)) %>%
# Pick the best available CO2 measurement for each time period
mutate(CO2 = ifelse(Time_hour >= as.Date("2001-02-17"), Lab, as.numeric(NA)),
CO2 = ifelse(Time_hour >= as.Date("2010-09-08"), Node3, CO2),
CO2 = ifelse(Time_hour >= as.Date("2011-05-13"), Col, CO2),
CO2 = ifelse(Time_hour >= as.Date("2012-01-29"), Node3, CO2)) %>%
# Sort by ascending time
select(Time_hour, CO2) %>%
arrange(Time_hour) %>%
saveRDS(., file.path(dir.path, "co2_hourly.RDS"))
# Radiation
# Read in all the radiation files in the rad.dir folder
rad_files = list.files(rad.dir, pattern = "xlsx", full.names = TRUE)
rad_files = setdiff(x = rad_files,
y = rad_files[grepl(pattern = "[(]1", rad_files)]) %>%
setdiff(x = .,
y = rad_files[grepl(pattern = "2009-2012 TLD100 RAM data_2", x = rad_files)])
# Reads in the data, formats it, and calculates the total radiation dose as
# Radiation_Dose = GCRDose + SAADose
data_rad <- function(fname){
dat <- read_excel(fname,
col_types = c("date", "text", "text", "numeric", "numeric")) %>%
select(Date, Instrument, Location,
Radiation_GCRDose = `GCRDose (mGy/d)`,
Radiation_SAADose = `SAADose (mGy/d)`) %>%
# Add GCRDose and SAA Dose
mutate(Radiation_Dose = Radiation_GCRDose + Radiation_SAADose)
return(dat)
}
out_rad = as.list(rad_files) %>%
map(data_rad) %>%
reduce(full_join) %>%
distinct()
out_rad %>%
saveRDS(., file = file.path(out.dir, "rad_raw.rds"))
# Aggregate radiation at the date/location level
rad_day = out_rad %>%
mutate(Time_day = as.POSIXct(trunc(Date, units = "days"))) %>%
group_by(Time_day) %>%
# There are multiple instruments/locations, just take the average of these every day
summarize(Radiation_Dose = mean(Radiation_Dose, na.rm = TRUE))
rad_day %>%
saveRDS(., file = paste0(dir.path, "rad_day.rds"))
# noise_hr_noloc.rds
# Read in all the noise files
noise_files = list.files(noise.dir, pattern = "xlsx", full.names = TRUE)
noise_files = noise_files[!grepl("Noise data.xlsx", noise_files)]
# This function reads in the file at fname
data_noise <- function(fname){
# Extract the sensor location from the filename:
# it is the text between "Static " and " Raw"
location = basename(fname) %>%
str_split(., "Static ") %>%
`[[`(1) %>% `[`(2) %>%
str_split(., " Raw") %>%
`[[`(1) %>% `[`(1)
# Deal with other cases
if (is.na(location)){
location <- if (grepl(pattern = "NODE 2", fname)){"Node 2"} else {location}
location <- if (grepl(pattern = "JPM", fname)){"JPM"} else {location}
}
location2 <- if (grepl(pattern = "Node 2", location)) {
"Node 2"
} else {
if (grepl(pattern = "US Lab", location)){
"US Lab"
} else {
"Other"
}
}
# Read in the data within the file
dat <- read_excel(fname,
col_types = c("date", "numeric"))
dat2 <- dat %>%
# Rename the column names
`colnames<-`(c("Timestamp", "dBA")) %>%
# Skip the empty rows
filter(!is.na(Timestamp)) %>%
# Add column for location of the sensor
mutate(Orig_Sensor = location,
Loc = location2,
# For noise interpolation:
# Day = 7am until 11pm (16 hours)
# Night = 11pm until 7am (8 hours)
Noise_Day = ((hour(Timestamp) >= 7) & (hour(Timestamp) < 23)))
return(dat2)
}
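For instance, applied to a hypothetical filename following the "Static ... Raw" pattern, the parsing in data_noise() extracts the location as follows:
library(tidyverse) # For stringr and the pipe
fname <- "2015-03-02 Static US Lab Raw Data.xlsx"
basename(fname) %>%
str_split(., "Static ") %>% `[[`(1) %>% `[`(2) %>% # "US Lab Raw Data.xlsx"
str_split(., " Raw") %>% `[[`(1) %>% `[`(1) # "US Lab"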
# Apply the data_noise() function to everything in noise_files
# And glue it all together
out_noise = as.list(noise_files) %>%
purrr::map(data_noise) %>%
reduce(full_join)
# Aggregate noise at the hour level - no location
out_noise %>%
mutate(Timestamp = as.POSIXct(Timestamp, tz = "UTC"),
Time_hour = as.POSIXct(trunc(Timestamp, units = "hours")),
# Decibels are on a logarithmic scale, so they cannot be averaged
# directly; instead, average the corresponding energies, where
# Energy = 10^(decibels/10)
Energy = 10^(dBA/10)) %>%
group_by(Time_hour, Noise_Day) %>%
summarize(Energy_Mean = mean(Energy, na.rm = TRUE)) %>%
ungroup %>%
mutate(Noise_dBA = 10*log(Energy_Mean, base = 10)) %>%
saveRDS(., file = paste0(dir.path, "noise_hr_noloc.rds"))
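As a quick numeric check of the energy-based averaging (illustrative values only):
db <- c(60, 70)
mean(db) # 65.0 -- naive dB average, biased low
10 * log10(mean(10^(db / 10))) # ~67.4 -- the energy-based average used above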