forked from USGS-R/drb-inland-salinity-ml
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path_targets.R
215 lines (184 loc) · 10.2 KB
/
_targets.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
library(targets)
Sys.setenv(
AWS_PROFILE = "dev",
AWS_REGION = 'us-east-1',
AWS_SHARED_CREDENTIALS_FILE = "./credentials"
)
options(tidyverse.quiet = TRUE,
#Use multiprocess on Windows, multicore in container (Linux).
clustermq.scheduler = "multicore")
library(clustermq)
library(tidyverse)
tar_option_set(packages = c("tidyverse", "lubridate",
"rmarkdown","dataRetrieval",
"knitr","leaflet","sf",
"purrr", "sbtools", "terra",
"patchwork", "glue", "nhdplusTools",
"Boruta", "ranger", "vip", "tidymodels",
"doParallel", "fastshap", "pdp",
"reticulate", "shapviz", "scico"),
resources = tar_resources(
aws = tar_resources_aws(bucket = "drb-inland-salinity")),
repository = "aws",
format = "rds",
priority = 0.8
)
source("1_fetch.R")
source("2_process.R")
source("3_visualize.R")
source("4_predict.R")
source("4_predict_plots.R")
dir.create("1_fetch/out/", showWarnings = FALSE)
dir.create("2_process/out/", showWarnings = FALSE)
dir.create("3_visualize/out/", showWarnings = FALSE)
dir.create("3_visualize/log/", showWarnings = FALSE)
dir.create("3_visualize/out/daily_timeseries_png/",showWarnings = FALSE)
dir.create("3_visualize/out/hourly_timeseries_png/",showWarnings = FALSE)
dir.create("3_visualize/out/nhdv2_attr_png/",showWarnings = FALSE)
dir.create("3_visualize/out/nhdv2_attr_png/refined",showWarnings = FALSE)
#Predict phase:
# create directories for each of the following combinations of data splits,
# feature sets and results:
predict_dir <- "4_predict/out"
train_test_other <- c("vip","hypopt")
train_test_splits <- c("temporal","random","spatial")
train_test_features <- c("RF_static", "RF_min_static", "RF_static_dynamic",
"RF_min_static_dynamic","RF_dynamic","RGCN_static_dynamic",
"RGCN_min_static_dynamic","RGCN_dynamic")
train_test_res <- c("pred_obs","spatial_res","monthly_res","annual_res","temporal_res")
rf_xai_plot_types <- c("shap","dependence")
rf_xai_shap_options <- c("seasonal","lulc", "physio")
rf_xai_shap_lulc_options <- c("high_forest", "high_urban")
rf_xai_shap_physio_options <- c("appalachian", "coastal", "interior")
rf_xai_shap_seasonal_options <- c("OND", "JFM", "AMJ", "JAS")
rf_xai_shap_seasonal_suboptions <- c(rf_xai_shap_lulc_options, rf_xai_shap_physio_options)
rf_xai_dep_options <- c("pdp","ice")
#directory paths
p4_dirs <- bind_rows(
expand.grid(predict_dir, train_test_splits, train_test_res, train_test_features),
expand.grid(predict_dir, train_test_splits, train_test_other),
expand.grid(predict_dir, train_test_splits, rf_xai_plot_types[rf_xai_plot_types == "shap"],
train_test_features, rf_xai_shap_options[rf_xai_shap_options == "lulc"], rf_xai_shap_lulc_options),
expand.grid(predict_dir, train_test_splits, rf_xai_plot_types[rf_xai_plot_types == "shap"],
train_test_features, rf_xai_shap_options[rf_xai_shap_options == "physio"], rf_xai_shap_physio_options),
expand.grid(predict_dir, train_test_splits, rf_xai_plot_types[rf_xai_plot_types == "shap"],
train_test_features, rf_xai_shap_options[rf_xai_shap_options == "seasonal"], rf_xai_shap_seasonal_options, rf_xai_shap_seasonal_suboptions),
expand.grid(predict_dir, train_test_splits, rf_xai_plot_types[rf_xai_plot_types == "dependence"],
train_test_features, rf_xai_dep_options)) %>%
mutate(file_path = case_when(is.na(Var4) ~ file.path(Var1, Var2, Var3),
is.na(Var5) ~ file.path(Var1, Var2, Var3, Var4),
is.na(Var6) ~ file.path(Var1, Var2, Var3, Var4, Var5),
is.na(Var7) ~ file.path(Var1, Var2, Var3, Var4, Var5, Var6),
TRUE ~ file.path(Var1, Var2, Var3, Var4, Var5, Var6, Var7))) %>%
pull(file_path)
# create the directories
for(i in seq_along(p4_dirs)){
dir.create(p4_dirs[i], recursive = TRUE, showWarnings = FALSE)
}
dir.create("4_predict/out/XAI_splits", showWarnings = FALSE)
dir.create("4_predict/out/spatial/RGCN", showWarnings = FALSE)
dir.create("4_predict/out/temporal/RGCN", showWarnings = FALSE)
# Define columns of interest for harmonized WQP data
wqp_vars_select <- c("MonitoringLocationIdentifier","MonitoringLocationName","LongitudeMeasure","LatitudeMeasure",
"MonitoringLocationTypeName","OrganizationIdentifier","ActivityStartDate","ActivityStartDateTime",
"ActivityStartTime.Time","ActivityStartTime.TimeZoneCode","CharacteristicName","param_group","param",
"USGSPCode","ActivityMediaName","ResultSampleFractionText","HydrologicCondition","HydrologicEvent",
"resultVal2","resultUnits2","ResultDetectionConditionText","ResultTemperatureBasisText",
"PrecisionValue","ResultStatusIdentifier","final")
# Define water quality major ions of interest
major_ion_names = c("Chloride","Sodium")
# Define hydrologic event types in harmonized WQP data to exclude
omit_wqp_events <- c("Spill","Volcanic action")
# Define USGS specific conductance parameter codes of interest
pcodes_select <- c("00095","90095","00094","90096")
# Define USGS specific conductance parameter code to use for continuous data
parameter <- "00095"
# Define minor HUCs (hydrologic unit codes) that make up the DRB to use in calls to dataRetrieval functions
# Lower Delaware: 0204 subregion (for now, exclude New Jersey Coastal (https://water.usgs.gov/GIS/huc_name.html)
drb_huc8s <- c("02040101","02040102","02040103","02040104","02040105","02040106",
"02040201","02040202","02040203","02040204","02040205","02040206","02040207")
#get the boundary of the DRB based on these HUCs
drb_shp <- drb_huc8s %>%
lapply(.,function(x){
# Create spatial object of huc8 basin
huc8_basin <- suppressMessages(nhdplusTools::get_huc8(id=x))}) %>%
bind_rows() %>%
# create one polygon from multiple huc8 basins
sf::st_union() %>%
sf::st_make_valid()
# Define USGS site types for which to download specific conductance data (include "Stream","Stream:Canal", and "Spring" sites)
site_tp_select <- c("ST","ST-CA","SP")
# Omit undesired sites
# sites 01412350, 01484272, and 01482695 coded as site type "ST" but appear to be tidally-influenced
omit_nwis_sites <- c("01412350","01484272","01482695")
# Define search radius to match sites to PRMS segments (in meters)
bird_dist_cutoff_m <- 500
# Retain NWIS sites even if nearest PRMS segment is outside of radius set by bird_dist_cutoff_m
retain_nwis_sites <- c("01474703","01477050")
# Define PRMS mainstem segments thought to be tidally-influenced (used for filtering sites used in daily/sub-daily models)
mainstem_reaches_tidal <- c("2771_1","2769_1","2768_1","2767_1","2764_1","2762_1","2763_1","2759_1","2757_1","2752_1",
"2753_1","2755_1","2772_1","388_1","389_1","385_1","386_1","383_1","382_1","377_1","378_1",
"376_1","351_1","344_1","346_1","333_1")
# Define the url for the NHGFv1 to NHDv2 crosswalk
# see https://github.com/USGS-R/drb-network-prep/commit/3637931f5a17469a4234eaed3d20ed44ba45958d
GFv1_NHDv2_xwalk_url <- "https://raw.githubusercontent.com/USGS-R/drb-network-prep/3637931f5a17469a4234eaed3d20ed44ba45958d/2_process/out/GFv1_NHDv2_xwalk_omit_zero_area.csv"
# Define the url for the edited HRU polygons
# see https://github.com/USGS-R/drb-network-prep/commit/3637931f5a17469a4234eaed3d20ed44ba45958d
GFv1_HRUs_edited_url <- "https://github.com/USGS-R/drb-network-prep/blob/3637931f5a17469a4234eaed3d20ed44ba45958d/2_process/out/GFv1_catchments_edited.gpkg?raw=true"
# Define USGS stat codes for continuous sites that only report daily statistics (https://help.waterdata.usgs.gov/stat_code)
stat_cd_select <- c("00001","00003")
# Define earliest startDate and latest endDate for NWIS data retrievals
earliest_date <- "1979-10-01"
latest_date <- "2021-12-31"
# Define dataset of interest for the national geospatial fabric (used to fetch PRMS catchment polygons):
gf_data_select <- 'GeospatialFabricFeatures_02.zip'
## NLCD
# path to all NLCD LC data 2001-2019
# Note: this data should be manually downloaded from the project Data folder in sharepoint (`NLCD_final/NLCD_final/`)
NLCD_LC_path <- file.path('1_fetch/in/NLCDs_2001_2019')
dir.create(NLCD_LC_path, showWarnings = F)
# last two digit suffix of years of NLCD data
NLCD_year_suffix <- c('01','04','06','08','11','13','16','19')
# Define land cover datasets to extract
sb_ids_NLCD2011 <- c(
# ImperviousnessPct_2011: 'https://www.sciencebase.gov/catalog/item/57057a9be4b0d4e2b7571fbb',
'57057a9be4b0d4e2b7571fbb',
# Imperviousness100mBufferRipZone:'https://www.sciencebase.gov/catalog/item/570577fee4b0d4e2b7571d7b',
'570577fee4b0d4e2b7571d7b',
# TreeCanopy_100mBuffered_RipZone: 'https://www.sciencebase.gov/catalog/item/570572e2e4b0d4e2b75718bc'
'570572e2e4b0d4e2b75718bc',
# Estimated percent of catchment in 50 meter riparian zone that contains the land-use and land-cover: 'https://www.sciencebase.gov/catalog/item/57855ddee4b0e02680bf37bf'
'57855ddee4b0e02680bf37bf'
)
# Define Land Cover dataset download folders:
NLCD2011_folders <- c(
'ImperviousnessPct_2011',
'Imperviousness100m_RipZone',
'TreeCanopy_100mBuffered_RipZone',
'NLCD_LandCover_50m_RipZone'
)
## NADP_sb_id var
## https://www.sciencebase.gov/catalog/item/57e2ac2fe4b0908250045981
NADP_sb_id <- '57e2ac2fe4b0908250045981'
## FORESCE list of FORESCE years
FORESCE_years <- c('1940', '1950', '1960', '1970', '1980', '1990', '2000')
#Boruta Random Forest Parameters
#maximum number of runs for Boruta feature screening algorithm
Boruta_runs <- 300
#number of trees
Boruta_trees <- 500
#number of cores
Boruta_cores <- 70
#The number of cores used to train RF models
#Note that this number is not used for every target, but it was useful to keep the RAM below 385 GB for all exact one model.
RF_cores <- 50
#Cross validation folds
cv_folds <- 5
#SHAP
SHAP_nsim <- 20
SHAP_RAM <- 6
RAM_set <- 300
SHAP_cores <- 50
# Return the complete list of targets
c(p1_targets_list, p2_targets_list, p3_targets_list, p4_targets_list,
p4_plot_targets_list)