From ca61c3ea033f5d7354ba362b1796e791d795cd1b Mon Sep 17 00:00:00 2001 From: Ilona Hughes Date: Thu, 11 Jul 2024 16:24:53 -0400 Subject: [PATCH 1/8] Daily and weekly drafts --- HARP-2024-2025/hydroimport_daily.R | 49 ++++++++++++++++++++ HARP-2024-2025/make_weekly_summary_ts.R | 61 +++++++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 HARP-2024-2025/hydroimport_daily.R create mode 100644 HARP-2024-2025/make_weekly_summary_ts.R diff --git a/HARP-2024-2025/hydroimport_daily.R b/HARP-2024-2025/hydroimport_daily.R new file mode 100644 index 00000000..f626c3a5 --- /dev/null +++ b/HARP-2024-2025/hydroimport_daily.R @@ -0,0 +1,49 @@ +# Inputs (args): +# 1 = File path of csv from VA Hydro +# 2 = Data source "nldas2, daymet, prism" +# 3 = End path of new csv +# Outputs: +# Csv file with manipulated data at end filepath + +# Library necessary packages +print("Accessing necessary libraries") +suppressPackageStartupMessages(library(lubridate)) +suppressPackageStartupMessages(library(sqldf)) + +# Set up command args +print("Reading command args") +args <- commandArgs(trailingOnly = TRUE) +if (length(args) != 3){ + message("Missing or extra inputs. Usage: Rscript hydroimport_daily.R input_path data_source output_path") + q() +} + +input_path <- args[1] +data_source <- args[2] +output_path <- args[3] + +# Pull csv from input file path +print("Reading csv") +hydro_daily <- read.csv(input_path) +# Add in more date information +print("Adding date information") +hydro_daily[,c('yr', 'mo', 'da', 'wk')] <- cbind(year(as.Date(hydro_daily$obs_date)), + month(as.Date(hydro_daily$obs_date)), + day(as.Date(hydro_daily$obs_date)), + week(as.Date(hydro_daily$obs_date))) + +# If data comes from nladas2 (hourly), it must be converted into daily data +print("Checking for nldas2") +if (data_source=="nldas2"){ + hydro_daily <- sqldf( + "select featureid, min(obs_date) as obs_date, yr, mo, da, + sum(precip_mm) as precip_mm, sum(precip_in) as precip_in + from hydro_daily + group by yr, mo, da + order by yr, mo, da + " + )} + +# Write csv in new file path +print("Writing csv") +write.csv(hydro_daily,file = output_path) \ No newline at end of file diff --git a/HARP-2024-2025/make_weekly_summary_ts.R b/HARP-2024-2025/make_weekly_summary_ts.R new file mode 100644 index 00000000..56930201 --- /dev/null +++ b/HARP-2024-2025/make_weekly_summary_ts.R @@ -0,0 +1,61 @@ +# Inputs (args): +# 1 = comp data csv filepath +# 2 = Output csv path +# 3 = data column + +# Libraries +suppressPackageStartupMessages(library(lubridate)) +suppressPackageStartupMessages(library(sqldf)) + + +args <- commandArgs(trailingOnly = T) + +if (length(args) != 3){ + message("Missing or extra inputs. Usage: Rscript make_weekly_summary_ts.R comp_data_filepath output_filepath data_column ") +} +# Missing from github issue:[weekly_column(default=weekly_mean_value)] +print("Assigning arguments") +comp_data_filepath <- args[1] +output_filepath <- args[2] +data_column <- args[3] + +print("Reading in comp data csv") +comp_data <- read.csv(comp_data_filepath) + +# Check for necessary data (Comp data doesnt have start and end date?) 
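+# Sketch of a consolidated check (assumption: `require_columns` is a hypothetical
+# helper, not an existing part of this workflow). It reports every missing column
+# at once and could stand in for the per-column checks below.
+require_columns <- function(df, cols){
+  missing_cols <- cols[!cols %in% colnames(df)]
+  if (length(missing_cols) > 0){
+    message("Missing required column(s): ", paste(missing_cols, collapse = ", "))
+    q()
+  }
+}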
+print("Checking columns") +#if ("start_date" %in% colnames(comp_data)){}else{ + # message("start_date column missing") + # q() +#} + +#if ("end_date" %in% colnames(comp_data)){}else{ + #message("end_date column missing") + #q() +#} + +if (data_column %in% colnames(comp_data)){}else{ + message(data_column, "column missing") + q() +} + + +#converts our daily data into weekly +print("Converting to weekly data") + week_data <- sqldf( + "select min(obs_date) as week_begin, yr, wk, min(dataset_day) as dataset_day_begin, + avg(dataset_p_in) as dataset_p_in, avg(dataset_cfs) as dataset_cfs, + avg(usgs_cfs) as usgs_cfs, avg(today_d_cfs) as today_d_cfs, + avg(nextday_d_cfs) as nextday_d_cfs + from comp_data + group by yr, wk + order by yr, wk + " + ) + #Base the month column off of the min day oindex for that week, or the day that + #week began + week_data$mo <- month(week_data$week_begin) + + + write.csv(week_data,output_filepath) + From a74051ea5742b69f75182c90c94509a83dd7d7d2 Mon Sep 17 00:00:00 2001 From: Ilona Hughes Date: Mon, 9 Sep 2024 13:25:49 -0400 Subject: [PATCH 2/8] Data source analysis --- HARP-2024-2025/DataSourceEvaluations.R | 147 +++++++++ HARP-2024-2025/StormResults_Analysis.R | 394 +++++++++++++++++++++++++ 2 files changed, 541 insertions(+) create mode 100644 HARP-2024-2025/DataSourceEvaluations.R create mode 100644 HARP-2024-2025/StormResults_Analysis.R diff --git a/HARP-2024-2025/DataSourceEvaluations.R b/HARP-2024-2025/DataSourceEvaluations.R new file mode 100644 index 00000000..14c0c02f --- /dev/null +++ b/HARP-2024-2025/DataSourceEvaluations.R @@ -0,0 +1,147 @@ +# Load in libraries +library(sqldf) +library(ggplot2) +library(tidyverse) + + +################################################################################ +# PRISM List of files + +list_of_prism <- list.files(path="C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Ratings/out-prism", + recursive = TRUE, + pattern = "\\.csv$", + full.names = TRUE) +prism_list <- read_csv(list_of_prism, id="gageid") + +prism_list$gageid <- grep("0", unlist(strsplit(prism_list$gageid, "_")), value = TRUE, perl = TRUE) +prism_list$gageid <- grep("0", unlist(strsplit(prism_list$gageid, "-")), value = TRUE, perl = TRUE) +prism_list$...1 <- NULL + +prism_gages <- as.list(unique(prism_list$gageid)) + +################################################################################ +# DAYMET List of files + +list_of_daymet <- list.files(path="C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Ratings/out-daymet/out", + recursive = TRUE, + pattern = "\\.csv$", + full.names = TRUE) +daymet_list <- read_csv(list_of_daymet, id="gageid") + +daymet_list$gageid <- grep("0", unlist(strsplit(daymet_list$gageid, "_")), value = TRUE, perl = TRUE) +daymet_list$gageid <- grep("0", unlist(strsplit(daymet_list$gageid, "-")), value = TRUE, perl = TRUE) +daymet_list$...1 <- NULL + +daymet_gages <- as.list(unique(daymet_list$gageid)) + +################################################################################ +# NLDAS2 List of Files + +list_of_nldas2 <- list.files(path="C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Ratings/out-daymet/out", + recursive = TRUE, + pattern = "\\.csv$", + full.names = TRUE) +nldas2_list <- read_csv(list_of_nldas2, id="gageid") + +nldas2_list$gageid <- grep("0", unlist(strsplit(nldas2_list$gageid, "_")), value = TRUE, perl = TRUE) +nldas2_list$gageid <- grep("0", unlist(strsplit(nldas2_list$gageid, "-")), value = TRUE, perl = TRUE) +nldas2_list$...1 <- NULL + +nldas2_gages <- 
as.list(unique(nldas2_list$gageid)) + +################################################################################ +# Find gages which ran for each data source + +complete_gages <- as.list(intersect(prism_gages,daymet_gages)) +complete_gages <- as.list(as.numeric(intersect(complete_gages,nldas2_gages))) + +################################################################################ +# +for(j in c("daymet","prism","nldas")){ + print(j) + #Set path to read in ratings based on data source of outer loop + pathToread <- paste0("C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Ratings/",j,"/") + for( i in list.files(pathToread) ){ + filei <- read.csv(paste0(pathToread,i)) + #Store the gage number + filei$gage <- gsub(".*_(\\d+)-.*","\\1",i) + #Combine the file into a single data frame + if(!exists("combineFile")){ + combineFile <- filei + }else{ + combineFile <- rbind(combineFile,filei) + } + } + #Assign to a specific variable and delete the generic combineFile + assign(paste0('combineFile',j),combineFile) + rm(combineFile) +} + +#Join the daymet and prism data together +joinData <- combineFileprism %>% + #Remove the row numbers (not important) and rename the r_squared for clarity + select(-X,prismRating = r_squared) %>% + #Join in the dayment data, but first rename the r_squared column for clarity. + #Join on gage and month + left_join(combineFiledaymet %>% + select(daymetRating = r_squared,gage,mo), + by = c("gage","mo")) %>% + #Join in the nldas data, but first rename the r_squared column for clarity. + #Join on gage and month + left_join(combineFilenldas %>% + select(nldasRating = r_squared,gage,mo), + by = c("gage","mo")) %>% + #Pivot it longer to have a column with the data source and one for the + #ratings, for plotting ease + pivot_longer(c(prismRating,daymetRating,nldasRating), + names_to = 'dataSource', + values_to = 'rating') + + + +################################################################################ +# Import and add drainage are from USGS +drainage_area <- dataRetrieval::readNWISsite(c(unique(joinData$gage))) +joinData <- sqldf( + "select a.mo as mo, a.gage as gageid, a.dataSource as dataSource, a.rating as rating, + b.drain_area_va as drainageArea_sqmi + from joinData as a + left outer join drainage_area as b + on ( + a.gage = b.site_no + )" +) + +joinData$dataSource <- gsub("Rating","",joinData$dataSource) + + +################################################################################ +# Trends in DA and ratings + +ggplot(data=joinData)+ + aes(y=mean(joinData$rating, group_by(gageid)),x=joinData$drainageArea_sqmi) + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/HARP-2024-2025/StormResults_Analysis.R b/HARP-2024-2025/StormResults_Analysis.R new file mode 100644 index 00000000..95b5f5ad --- /dev/null +++ b/HARP-2024-2025/StormResults_Analysis.R @@ -0,0 +1,394 @@ +# Key for Gages: +# 01620500 = North River near Stokesville +# 01634500 = Cedar Creek near Winchester +# 01669000 = Piscataway Creek near Tappahannock +# 02021500 = Maury River at Rockbridge Falls +# 02031000 = Mechums River near White Hall +# 02044500 = Nottoway River near Rawlings +# 03176500 = New River at Glen Lyn +# 03165000 = Chesnut Creek at Galax +# 02054530 = Roanoke River at Glenvar +# 02034000 = Rivanna River at Palmyra + +# sql, ggplot2 +library(sqldf) +library(ggplot2) +library(tidyverse) + +############################################################################## +# STORM FILE DATA COMPILATION + +list_of_files <- list.files(path="C:/Users/ilona/OneDrive - 
Virginia Tech/HARP/Storm Tests/Ratings/out", + recursive = TRUE, + pattern = "\\.csv$", + full.names = TRUE) +df <- read_csv(list_of_files, id="gageid") + +df$gageid <- grep("0", unlist(strsplit(df$gageid, "_")), value = TRUE, perl = TRUE) +df$gageid <- grep("0", unlist(strsplit(df$gageid, "-")), value = TRUE, perl = TRUE) +df$...1 <- NULL + +df$season <- ifelse (df$mo %in% c(6:8), + "SUMMER", + ifelse (df$mo %in% c(9:11), + "AUTUMN", + ifelse (df$mo %in% c(12,1,2), + "WINTER", + ifelse (df$mo %in% c(3:5), + "SPRING", NA)))) + +############################################################################## +# MONTHLY AND SEASONAL R-SQUARED PLOTS + +ggplot(data = df)+ + aes(y = season, x=r_squared, color=season)+ + geom_boxplot()+ + xlim(0,1)+ + theme_bw()+ + xlab("R-Squared")+ + ylab("Season")+ + ggtitle("Seasonal R-Squared Values ()")+ + theme(plot.title = element_text(hjust = 0.5), legend.position = "none") + + +ggplot(data = df)+ + aes(y = mo, x=r_squared)+ + geom_boxplot(mapping = aes(group = mo))+ + xlim(0,1)+ + theme_bw()+ + xlab("R-Squared")+ + ylab("Month")+ + ggtitle("Monthly R-Squared Values ()")+ + theme(plot.title = element_text(hjust = 0.5), legend.position = "none")+ + scale_y_continuous(breaks = seq(0,12,1)) + + +############################################################################ +# Highest and lowest avg R-squared + +gage_mean<- aggregate(x= df$r_squared, + # Specify group indicator + by = list(df$gageid), + # Specify function (i.e. mean) + FUN = mean, + na.rm = TRUE ) +print(gage_mean) + +ggplot(data = gage_mean, + mapping = aes(x=x))+ + geom_density(color="dodgerblue3")+ + theme_bw()+ + ylab("Density")+ + xlab("R-Squared")+ + ggtitle("Mean R-Squared Value Density for each Gage")+ + theme(plot.title = element_text(hjust = 0.5))+ + xlim(0,1)+ + geom_vline(xintercept = mean(gage_mean$x), lwd=0.5,colour="purple4",linetype = "dashed") + +ggplot(data = df, + mapping = aes(x=r_squared))+ + geom_density(color="indianred4")+ + theme_bw()+ + ylab("Density")+ + xlab("R-Squared")+ + ggtitle("Total R-Squared Value Density")+ + theme(plot.title = element_text(hjust = 0.5))+ + xlim(0,1)+ + geom_vline(xintercept = mean(df$r_squared, na.rm = TRUE), lwd=0.5,colour="deeppink3",linetype = "dashed") + + +############################################################################## +# Highest R-squared Gage analysis + +high_data <- read.csv("C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Ratings/out/usgs_ws_01652500-PRISM-storm_volume-ratings.csv") + +ggplot(data = high_data, + mapping = aes(x=mo,y=r_squared))+ + geom_line(color="magenta4")+ + theme_bw()+ + ylim(0,1)+ + xlab("Month")+ + ylab("R-Squared")+ + ggtitle("Monthly R-Squared, Highest Mean")+ + theme(plot.title = element_text(hjust = 0.5)) + + +############################################################################# +# Lowest R-squared Gage analysis + +low_data <- read.csv("C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Ratings/out/usgs_ws_01671025-PRISM-storm_volume-ratings.csv") + +ggplot(data = low_data, + mapping = aes(x=mo,y=r_squared))+ + geom_line(color="darkolivegreen3")+ + theme_bw()+ + ylim(-0.1,1)+ + xlab("Month")+ + ylab("R-Squared")+ + ggtitle("Monthly R-Squared, Lowest Mean")+ + theme(plot.title = element_text(hjust = 0.5)) + +############################################################################# + +precip <- read.csv("C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Peak Diff/usgs_ws_02024000-PRISM-daily.csv") +stormflow <- read.csv("C:/Users/ilona/OneDrive - Virginia 
Tech/HARP/Storm Tests/Peak Diff/usgs_ws_02024000-stormevent-flow.csv") + +precip$obs_date <- as_date(precip$obs_date) +stormflow$timestamp <- as_date(stormflow$timestamp) + +stormflow <- subset(stormflow, stormflow$timestamp >= min(precip$obs_date)) +stormflow <- subset(stormflow, stormflow$timestamp <= max(precip$obs_date)) + +compdata <- sqldf( + "select a.obs_date, a.precip_in, + b.flow, b.stormID + from precip as a + left outer join stormflow as b + on ( + a.yr = b.year + and a.obs_date = b.timestamp + )" +) + + + +############################################################################ +# Plot Ratings +ggplot(data=comp_ratings, + mapping = aes(x=mo))+ + geom_line(aes(y=r_01620500), color="darkred")+ + geom_line(aes(y=r_01629500), color="red3")+ + geom_line(aes(y=r_01634500), color="firebrick1")+ + geom_line(aes(y=r_01643700), color="orangered1")+ + geom_line(aes(y=r_01662800), color="darkorange1")+ + geom_line(aes(y=r_01669000), color="orange1")+ + geom_line(aes(y=r_01669520), color="goldenrod1")+ + geom_line(aes(y=r_02018500), color="yellow2")+ + geom_line(aes(y=r_02021500), color="yellowgreen")+ + geom_line(aes(y=r_02024000), color="palegreen3")+ + geom_line(aes(y=r_02026000), color="darkslategray3")+ + geom_line(aes(y=r_02029000), color="deepskyblue3")+ + geom_line(aes(y=r_02031000), color="dodgerblue4")+ + geom_line(aes(y=r_02032250), color="darkslateblue")+ + geom_line(aes(y=r_02034000), color="purple4")+ + geom_line(aes(y=r_02033500), color="darkorchid4")+ + geom_line(aes(y=r_02051000), color="darkviolet")+ + geom_line(aes(y=r_02054530), color="darkmagenta")+ + geom_line(aes(y=r_03165000), color="deeppink3")+ + geom_line(aes(y=r_03176500), color="hotpink")+ + theme_bw()+ + xlab("Month")+ + ylab("R-Squared")+ + ggtitle("Monthly R-Squared")+ + theme(plot.title = element_text(hjust = 0.5))+ + ylim(0,1) + + + +ggplot(data=comp_ratings, + mapping = aes(x=mo))+ + geom_line(aes(y=mean(r_01620500)), color="darkred")+ + geom_line(aes(y=mean(r_01629500)), color="red3")+ + geom_line(aes(y=mean(r_01634500)), color="firebrick1")+ + geom_line(aes(y=mean(r_01643700)), color="orangered1")+ + geom_line(aes(y=mean(r_01662800)), color="darkorange1")+ + geom_line(aes(y=mean(r_01669000)), color="orange1")+ + geom_line(aes(y=mean(r_01669520)), color="goldenrod1")+ + geom_line(aes(y=mean(r_02018500)), color="yellow2")+ + geom_line(aes(y=mean(r_02021500)), color="yellowgreen")+ + geom_line(aes(y=mean(r_02024000)), color="palegreen3")+ + geom_line(aes(y=mean(r_02026000)), color="darkslategray3")+ + geom_line(aes(y=mean(r_02029000)), color="deepskyblue3")+ + geom_line(aes(y=mean(r_02031000)), color="dodgerblue4")+ + geom_line(aes(y=mean(r_02032250)), color="darkslateblue")+ + geom_line(aes(y=mean(r_02034000)), color="purple4")+ + geom_line(aes(y=mean(r_02033500)), color="darkorchid4")+ + geom_line(aes(y=mean(r_02051000)), color="darkviolet")+ + geom_line(aes(y=mean(r_02054530)), color="darkmagenta")+ + geom_line(aes(y=mean(r_03165000)), color="deeppink3")+ + geom_line(aes(y=mean(r_03176500)), color="hotpink")+ + theme_bw()+ + xlab("Month")+ + ylab("R-Squared")+ + ggtitle("Monthly R-Squared")+ + theme(plot.title = element_text(hjust = 0.5))+ + ylim(0.25,0.75) + + +ggplot(data=comp_ratings, + mapping = aes(x=mo))+ + geom_line(aes(y=mean(r_03176500)), color="hotpink") + + +ggplot(data = comp_ratings, + mapping = aes(x=mo))+ + geom_line(aes(y=r_01620500), color="red3")+ + geom_line(aes(y=r_01634500), color="orange2")+ + geom_line(aes(y=r_01669000), color="gold1")+ + geom_line(aes(y=r_02021500), 
color="yellowgreen")+ + geom_line(aes(y=r_02031000), color="darkturquoise")+ + geom_line(aes(y=r_02034000), color="dodgerblue")+ + geom_line(aes(y=r_02044500), color="dodgerblue4")+ + geom_line(aes(y=r_02054530), color="purple3")+ + geom_line(aes(y=r_03165000), color="orchid3")+ + geom_line(aes(y=r_03176500), color="hotpink")+ + theme_bw()+ + xlab("Month")+ + ylab("R-Squared")+ + ggtitle("Monthly R-Squared")+ + theme(plot.title = element_text(hjust = 0.5))+ + geom_rect(aes(xmin = 6, xmax = 8, ymin = 0.025, ymax = 0.55), + fill="transparent", color="red", size=0.75)+ + geom_rect(aes(xmin = 3, xmax = 5, ymin = 0.55, ymax = 0.85), + fill="transparent", color="red", size=0.75)+ + geom_rect(aes(xmin = 9.25, xmax = 10.75, ymin = 0.4, ymax = 0.85), + fill="transparent", color="red", size=0.75)+ + geom_rect(aes(xmin = 1.5, xmax = 2.5, ymin = 0.45, ymax = 0.85), + fill="transparent", color="red", size=0.75)+ + ylim(0,1) + + +# July Dip (> 5) +ggplot(data = comp_ratings, + mapping = aes(x=mo))+ + geom_line(aes(y=r_01620500), color="red3")+ + geom_line(aes(y=r_01634500), color="orange2")+ + geom_line(aes(y=r_02021500), color="yellowgreen")+ + geom_line(aes(y=r_02044500), color="dodgerblue4")+ + geom_line(aes(y=r_02054530), color="purple3")+ + geom_line(aes(y=r_03165000), color="orchid3")+ + theme_bw()+ + xlab("Month")+ + ylab("R-Squared")+ + ggtitle("July Decrease in Monthly R-Squared")+ + theme(plot.title = element_text(hjust = 0.5))+ + xlim(5,9)+ + ylim(0,1) + +# April Peak (not > 5) +ggplot(data = comp_ratings, + mapping = aes(x=mo))+ + geom_line(aes(y=r_02021500), color="yellowgreen")+ + geom_line(aes(y=r_02034000), color="dodgerblue")+ + geom_line(aes(y=r_02054530), color="purple3")+ + geom_line(aes(y=r_03165000), color="orchid3")+ + geom_line(aes(y=r_03176500), color="hotpink")+ + theme_bw()+ + xlab("Month")+ + ylab("R-Squared")+ + ggtitle("April Peak Monthly R-Squared")+ + theme(plot.title = element_text(hjust = 0.5))+ + xlim(2,6)+ + ylim(0,1) + + +# February Peak (not > 5) +ggplot(data = comp_ratings, + mapping = aes(x=mo))+ + geom_line(aes(y=r_01669000), color="gold1")+ + geom_line(aes(y=r_02031000), color="darkturquoise")+ + geom_line(aes(y=r_02054530), color="purple3")+ + geom_line(aes(y=r_03176500), color="hotpink")+ + theme_bw()+ + xlab("Month")+ + ylab("R-Squared")+ + ggtitle("February Peak Monthly R-Squared")+ + theme(plot.title = element_text(hjust = 0.5))+ + xlim(1,4)+ + ylim(0,1) + + +# October Peak (not > 5) +ggplot(data = comp_ratings, + mapping = aes(x=mo))+ + geom_line(aes(y=r_01620500), color="red3")+ + geom_line(aes(y=r_02021500), color="yellowgreen")+ + geom_line(aes(y=r_02031000), color="darkturquoise")+ + geom_line(aes(y=r_02044500), color="dodgerblue4")+ + geom_line(aes(y=r_03165000), color="orchid3")+ + theme_bw()+ + xlab("Month")+ + ylab("R-Squared")+ + ggtitle("October Peak Monthly R-Squared")+ + theme(plot.title = element_text(hjust = 0.5))+ + xlim(8,12)+ + ylim(0,1) + + +# Data frame with monthly mean r-squared value +monthly_mean <- data.frame(rowMeans(comp_ratings[,2:11])) +monthly_mean$mo <- c(1:12) +monthly_mean$season <- ifelse (monthly_mean$mo %in% c(6:8), + "SUMMER", + ifelse (monthly_mean$mo %in% c(9:11), + "AUTUMN", + ifelse (monthly_mean$mo %in% c(12,1,2), + "WINTER", + ifelse (monthly_mean$mo %in% c(3:5), + "SPRING", NA)))) +monthly_mean <- monthly_mean[, c(2,1)] +colnames(monthly_mean)[2] <- "mean_r_squared" + + + + +# Add season to comp data +comp_ratings$season <- ifelse (comp_ratings$mo %in% c(6:8), + "SUMMER", + ifelse (comp_ratings$mo %in% c(9:11), + 
"AUTUMN", + ifelse (comp_ratings$mo %in% c(12,1,2), + "WINTER", + ifelse (comp_ratings$mo %in% c(3:5), + "SPRING", NA)))) + +library(ggbeeswarm) + +ggplot(data = pivot_longer(comp_ratings,2:21))+ + aes(y = season, x=value, color=season)+ + geom_boxplot()+ + xlim(0,1)+ + theme_bw()+ + xlab("R-Squared")+ + ylab("Season")+ + ggtitle("Seasonal R-Squared Values")+ + theme(plot.title = element_text(hjust = 0.5), legend.position = "none") + + +ggplot(data = pivot_longer(comp_ratings,2:21))+ + aes(y = mo, x=value)+ + geom_boxplot(mapping = aes(group = mo))+ + xlim(0,1)+ + theme_bw()+ + xlab("R-Squared")+ + ylab("Month")+ + ggtitle("Monthly R-Squared Values")+ + theme(plot.title = element_text(hjust = 0.5), legend.position = "none")+ + scale_y_continuous(breaks = seq(0,12,1)) + + + + +# Plot monthly average rsquared value +ggplot(data = monthly_mean, + mapping = aes(x=mo))+ + geom_line(aes(y=mean_r_squared), color="hotpink3")+ + theme_bw()+ + xlab("Month")+ + ylab("Mean R-Squared")+ + ggtitle("Monthly Average R-Squared Values")+ + theme(plot.title = element_text(hjust = 0.5))+ + ylim(0,1) + + + + + + + + + + + + + From b5b874e688cc0eca0689c8b4eafc3eb1597d5704 Mon Sep 17 00:00:00 2001 From: Nate Foley Date: Mon, 7 Oct 2024 12:09:47 -0400 Subject: [PATCH 3/8] VA hydro analytics (unfinished) --- HARP-2024-2025/basic_analytics_for_vahydro.R | 184 +++++++++++++++++++ HARP-2024-2025/storm_event_ratings.R | 39 ++++ HARP-2024-2025/using_basic_analytics.R | 14 ++ 3 files changed, 237 insertions(+) create mode 100644 HARP-2024-2025/basic_analytics_for_vahydro.R create mode 100644 HARP-2024-2025/storm_event_ratings.R create mode 100644 HARP-2024-2025/using_basic_analytics.R diff --git a/HARP-2024-2025/basic_analytics_for_vahydro.R b/HARP-2024-2025/basic_analytics_for_vahydro.R new file mode 100644 index 00000000..f4782636 --- /dev/null +++ b/HARP-2024-2025/basic_analytics_for_vahydro.R @@ -0,0 +1,184 @@ +#Creates Summary analytic functions for workflow +#Use for VA HYDRO metrics +library(tidyr) +library(sqldf) + +## For PRISM data + +prism_summary_analytics <- function(df){ +df <- + separate (data = df, + col = obs_date, + into = c("obs_date","obs_time"), + sep = " ") +df <- + separate (data = df, + col = obs_date, + into = c("obs_year","obs_month","obs_day"), + sep = "-") + +#creating a yearly summary with each year and its total precip +yearly.summary <- + sqldf( + "SELECT obs_year, SUM(precip_in) AS total_precip + FROM df + GROUP BY obs_year" +) + +#summary analytics + +precip_annual_max_in <- + max(yearly.summary$total_precip) + +precip_annual_max_year <- + yearly.summary$obs_year[which.max(yearly.summary$total_precip)] + +precip_annual_mean_in <- + mean(yearly.summary$total_precip) + + +#For min values and years, we can exclude the last row since the current year +#is incomplete data + +precip_annual_min_in <- + min(yearly.summary$total_precip[-nrow(yearly.summary)]) +precip_annual_min_year <- + yearly.summary$obs_year[which.min + (yearly.summary$total_precip[-nrow + (yearly.summary)])] + + +precip_daily_max_in <- max(df$precip_in) +precip_hourly_max_in <- max(df$precip_in)/24 +#precip_hourly_max_in done using interpolation (not super realistic). +#Alternatively, we could look at the rainfall distribution table for a +#type II storm and multiply by the max P(t)/P(24) value. + +#not sure about last metric, 190_precip_in. 
+print(c(precip_annual_max_in, precip_annual_max_year, precip_annual_mean_in, + precip_annual_min_in, precip_annual_min_year, precip_daily_max_in, + precip_hourly_max_in)) #move into dataframe with 7 columns and 1 row +} + +# For Daymet + +daymet_summary_analytics <- function(df){ + df <- + separate (data = df, + col = obs_date, + into = c("obs_date","obs_time"), + sep = " ") + df <- + separate (data = df, + col = obs_date, + into = c("obs_year","obs_month","obs_day"), + sep = "-") + + #creating a yearly summary with each year and its total precip + yearly.summary <- + sqldf( + "SELECT obs_year, SUM(precip_in) AS total_precip + FROM df + GROUP BY obs_year" + ) + + #summary analytics + + precip_annual_max_in <- + max(yearly.summary$total_precip) + + precip_annual_max_year <- + yearly.summary$obs_year[which.max(yearly.summary$total_precip)] + + precip_annual_mean_in <- + mean(yearly.summary$total_precip) + + + #For min values and years, we can exclude the last row since the current year + #is incomplete data + + precip_annual_min_in <- + min(yearly.summary$total_precip[-nrow(yearly.summary)]) + precip_annual_min_year <- + yearly.summary$obs_year[which.min + (yearly.summary$total_precip[-nrow + (yearly.summary)])] + + + precip_daily_max_in <- max(df$precip_in) + precip_hourly_max_in <- max(df$precip_in)/24 + #precip_hourly_max_in done using interpolation (not super realistic). + #Alternatively, we could look at the rainfall distribution table for a + #type II storm and multiply by the max P(t)/P(24) value. + + #not sure about last metric, 190_precip_in. + print(c(precip_annual_max_in, precip_annual_max_year, precip_annual_mean_in, + precip_annual_min_in, precip_annual_min_year, precip_daily_max_in, + precip_hourly_max_in)) +} + +# For nldas2 + +nldas2_summary_analytics <- function(df){ + df <- + separate (data = df, + col = obs_date, + into = c("obs_date","obs_time"), + sep = " ") + df <- + separate (data = df, + col = obs_date, + into = c("obs_year","obs_month","obs_day"), + sep = "-") + + #creating a yearly summary with each year and its total precip + yearly.summary <- + sqldf( + "SELECT obs_year, SUM(precip_in) AS total_precip + FROM df + GROUP BY obs_year" + ) + + #summary analytics + + precip_annual_max_in <- + max(yearly.summary$total_precip) + + precip_annual_max_year <- + yearly.summary$obs_year[which.max(yearly.summary$total_precip)] + + precip_annual_mean_in <- + mean(yearly.summary$total_precip) + + + #For min values and years, we can exclude the last row since the current year + #is incomplete data + + precip_annual_min_in <- + min(yearly.summary$total_precip[c(-nrow(yearly.summary),-1)]) + precip_annual_min_year <- + yearly.summary$obs_year[which.min + (yearly.summary$total_precip[c(-nrow + (yearly.summary),-1)])] + daily.summary <- + sqldf( + "SELECT obs_year, obs_month, obs_day, SUM(precip_in) AS total_precip + FROM df + GROUP BY obs_year, obs_month, obs_day" + ) + + precip_daily_max_in <- max(daily.summary$total_precip[-1]) + + precip_hourly_max_in <- max(df$precip_in) + + + #not sure about last metric, 190_precip_in. 
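+  #("190_precip_in" presumably refers to l90_precip_in, a 90-day low-precipitation
+  #metric analogous to a 90-day low flow; later commits in this series add it to
+  #summary_analytics() via a rolling 90-day mean and IHA's group2().)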
+ print(c(precip_annual_max_in, precip_annual_max_year, precip_annual_mean_in, + precip_annual_min_in, precip_annual_min_year, precip_daily_max_in, + precip_hourly_max_in)) +} + + +metric <- c("precip_annual_max_in", 'precip_annual_max_year', 'precip_annual_mean_in', + 'precip_annual_min_in', 'precip_annual_min_year', 'precip_hourly_max_in', + 'precip_daily_max_in') \ No newline at end of file diff --git a/HARP-2024-2025/storm_event_ratings.R b/HARP-2024-2025/storm_event_ratings.R new file mode 100644 index 00000000..fd00f31a --- /dev/null +++ b/HARP-2024-2025/storm_event_ratings.R @@ -0,0 +1,39 @@ +## Set working directory +setwd("C:/Users/natef/OneDrive - Virginia Tech/HARP") + +#get packages +library(ggplot2) +library(sqldf) +library(tidyr) + +#set varibales for stream gage id +gage_id <- "02026000" + + +#download stormeventflow and rating values +stormflow <- read.csv(paste("http://deq1.bse.vt.edu:81/met/stormVol_nldas2/flow/usgs_ws_",gage_id,"-stormevent-flow.csv",sep = "")) +ratings0 <- read.csv(paste("http://deq1.bse.vt.edu:81/met/nldas2/out/usgs_ws_",gage_id,"-nldas2-lm_simple-rating-ts.csv",sep = "")) + +#clean up ratings dates +ratings <- ratings0 %>% + separate(start_date, into = c("timestamp","start_time"), sep = " ") + +#sql join +# Perform the join +result <- sqldf("SELECT stormflow.timestamp, stormflow.stormID, ratings.rating + FROM stormflow + INNER JOIN ratings ON stormflow.timestamp = ratings.timestamp") +is.na(result$stormID) + +plot <- ggplot(data = result, + mapping = aes(x = !is.na(stormID), y = rating))+ + geom_boxplot()+ + xlab("Storm Event?")+ + ggtitle("Comparison of Ratings During Storm Events and Outside of Storm Events")+ + theme_bw()+ + ylim(c(0,1)) + +plot + + + diff --git a/HARP-2024-2025/using_basic_analytics.R b/HARP-2024-2025/using_basic_analytics.R new file mode 100644 index 00000000..02b0a617 --- /dev/null +++ b/HARP-2024-2025/using_basic_analytics.R @@ -0,0 +1,14 @@ +source("C:/Users/natef/OneDrive - Virginia Tech/HARP/Github/HARParchive/HARP-2024-2025/basic analytics for vahydro.R") + +prism <- read.csv("http://deq1.bse.vt.edu:81/met/stormVol_prism/precip/usgs_ws_01613900-PRISM-all.csv") +prism.summary <- prism_summary_analytics(prism) + +daymet <- read.csv("http://deq1.bse.vt.edu:81/met/daymet/precip/usgs_ws_01613900-daymet-all.csv") +daymet.summary <- daymet_summary_analytics(daymet) + +nldas2 <- read.csv("http://deq1.bse.vt.edu:81/met/nldas2/precip/usgs_ws_01613900-nldas2-all.csv") +nldas2.summary <- nldas2_summary_analytics(nldas2) + +analytics <- data.frame(metric, prism.summary)#, daymet.summary, nldas2.summary) + +precip_annual_max_in <- analytics$prism.summary[1] From d8d883dbc0a2bba9bf9ccaf001d95b2556d5040e Mon Sep 17 00:00:00 2001 From: Nate Foley Date: Fri, 25 Oct 2024 10:03:52 -0400 Subject: [PATCH 4/8] 10.25 metrics commit --- HARP-2024-2025/GIS script.R | 140 +++++++++++++++++++ HARP-2024-2025/basic_analytics_for_vahydro.R | 102 +++++++++----- HARP-2024-2025/caching function.R | 33 +++++ HARP-2024-2025/retrieve_vahydro_metrics.R | 12 ++ HARP-2024-2025/using_basic_analytics.R | 36 ++++- HARP-2024-2025/variable definitions.R | 6 + 6 files changed, 288 insertions(+), 41 deletions(-) create mode 100644 HARP-2024-2025/GIS script.R create mode 100644 HARP-2024-2025/caching function.R create mode 100644 HARP-2024-2025/retrieve_vahydro_metrics.R create mode 100644 HARP-2024-2025/variable definitions.R diff --git a/HARP-2024-2025/GIS script.R b/HARP-2024-2025/GIS script.R new file mode 100644 index 00000000..fbcb3196 --- /dev/null +++ 
b/HARP-2024-2025/GIS script.R @@ -0,0 +1,140 @@ +library(tidyverse) +#Assumes all ratings files are in a folder ratings/daymet/ or ratings/prism/ +for(j in c("daymet","prism","nldas")){ + print(j) + #Set path to read in ratings based on data source of outer loop + pathToread <- paste0("C:/Users/natef/OneDrive - Virginia Tech/HARP/Ratings/",j,"/") + for( i in list.files(pathToread) ){ + #For each file in the directory, read in the file i as a csv + filei <- read.csv(paste0(pathToread,i)) + #Store the gage number + filei$gage <- gsub(".*_(\\d+)-.*","\\1",i) + #Store the analysis type + filei$workflow <- gsub(".*_","",j) + #Keep only the necessary columns, depending on the workflow: + if(filei$workflow[1] == "simplelm"){ + filei <- filei[,c("mo","rating","gage","workflow")] + }else{ + filei <- filei[,c("mo","r_squared","gage","workflow")] + } + names(filei) <- c("mo","rating","gage","workflow") + #Combine the file into a single data frame + if(!exists("combineFile")){ + combineFile <- filei + }else{ + combineFile <- rbind(combineFile,filei) + } + } + #Assign to a specific variable and delete the generic combineFile + assign(paste0('combineFile',j),combineFile) + rm(combineFile) +} + +#Join the daymet and prism data together +joinData <- combineFileprism %>% + #Rename the rating for clarity + select(prismRatingstorm = rating,gage,mo,workflow) %>% + #Join in the dayment data, but first rename the rating column for clarity. + #Join on gage, month, and workflow + full_join(combineFiledaymet %>% + select(daymetRatingstorm = rating,gage,mo,workflow), + by = c("gage","mo","workflow")) %>% + full_join(combineFiledaymet %>% + select(daymetRatinglm = rating,gage,mo,workflow), + by = c("gage","mo","workflow")) %>% + #Add remaining prism data: + full_join(combineFileprism %>% + select(prismRatinglm = rating,gage,mo,workflow), + by = c("gage","mo","workflow")) %>% + #Join in the nldas data, but first rename the rating column for clarity. + #Join on gage, month, and workflow + full_join(combineFilenldas %>% + select(nldasRatingstorm = rating,gage,mo,workflow), + by = c("gage","mo","workflow")) %>% + full_join(combineFilenldas %>% + select(nldasRatinglm = rating,gage,mo,workflow), + by = c("gage","mo","workflow")) %>% + #For easier viewing, combine lm and storm columns such that there is only one + #column for prism, daymet, nldas classified by the workflow column + mutate(prismRating = coalesce(prismRatingstorm,prismRatinglm), + daymetRating = coalesce(daymetRatingstorm,daymetRatinglm), + nldasRating = coalesce(nldasRatingstorm,nldasRatinglm) + ) %>% + #Remove now superflous columns: + select(-prismRatingstorm,-prismRatinglm, + -daymetRatingstorm,-daymetRatinglm, + -nldasRatingstorm,-nldasRatinglm) %>% + #Pivot it longer to have a column with the data source and one for the + #ratings, for plotting ease + pivot_longer(c(prismRating,daymetRating,nldasRating), + names_to = 'dataSource', + values_to = 'rating') %>% + arrange(gage,mo,workflow) + +#At each gage, does the best performing data source change between workflows? +#The below code is for ESTIMATES ONLY. The left_join assumes that the ratings +#are unique between datasources for each gage, workflow, month. This is brittle +#and could result in incorrect answers! 
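+#A join-free alternative (sketch only, not used below; `bestByGroup` is a
+#hypothetical name and slice_max() assumes dplyr >= 1.0): keep the highest-rated
+#row per gage/workflow/month directly, which avoids matching rows back by the
+#rating value and so is robust to duplicate ratings.
+bestByGroup <- joinData %>%
+  dplyr::group_by(gage, workflow, mo) %>%
+  dplyr::slice_max(rating, n = 1, with_ties = FALSE) %>%
+  dplyr::ungroup()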
+ +safe_max <- function(x) { + if (length(na.omit(x)) == 0) { + return(NA) # or any other placeholder you prefer + } else { + return(max(x, na.rm = TRUE)) + } +} + +gageCompare <- joinData %>% dplyr::ungroup() %>% + #Group by gage, workflow, and month and find the max rating: + dplyr::group_by(gage,workflow,mo) %>% + dplyr::summarise(maxRating = mean(safe_max(rating))) %>% + #Join the joinData df back in matching by gage, workflow, mo, and rating. This + #could be problematic with duplicate ratings as in a case where all ratings + #across the data sources are the same value + left_join(joinData,by = c('gage','workflow','mo','maxRating' = 'rating')) %>% + #Now pivot wider such that we can see ratings and best data sources side-by-side + pivot_wider(names_from = workflow,values_from = c(maxRating,dataSource)) %>% + #Now filter to only find instances where the best data sources are different: + filter(dataSource != dataSource) %>% + #Create a column showing the difference in rating and arrange by the difference + mutate(differenceInRating = maxRating - maxRating) %>% + arrange(differenceInRating) + + + +#Isolate one month +gageCompareMonth <- gageCompare[gageCompare$mo == 10,] + +library(sf) +#Get the watershed coverage from the server +watershedGeo <- read.csv("http://deq1.bse.vt.edu:81/met/usgsGageWatershedGeofield.csv") + +#Get the gage numbers as their own field and store a copy of the data +gageWatershedSF <- watershedGeo +gageWatershedSF$gage <- gsub(".*_(\\d+)","\\1",gageWatershedSF$hydrocode) + +#Let's begin by plotting October daymet ratings for stormVol +forPlot <- joinData[joinData$dataSource == 'daymetRating' & + joinData$workflow == 'stormvol' & + joinData$mo == 10,] +#Join the geometry data onto out plot data +joinDataSF <- forPlot %>% + left_join(gageWatershedSF %>% select(-hydrocode), + by = 'gage') %>% + filter(!is.na(wkt)) +#Create an SF object. Specify the coordinate system and the field name in the +#data frame that contains the well-known text. 
In this case, WKT is the name of +#the field with the polygon geometries +joinDataSF <- st_as_sf(joinDataSF,wkt = 'wkt',crs = 4326) +#Repair broken geometries +joinDataSF <- st_make_valid(joinDataSF) +#Add shape area in coordinate system units (likely meaningless in crs 4326) +joinDataSF$area <- st_area(joinDataSF) +#Order the data by largest to smallest area to make plotting more effective +joinDataSF <- joinDataSF[order(joinDataSF$area,decreasing = TRUE),] +#Plot SF polygons +ggplot(joinDataSF) + + geom_sf(aes(fill = rating)) + + scale_fill_viridis_c(option = 'magma') + + theme_classic() + diff --git a/HARP-2024-2025/basic_analytics_for_vahydro.R b/HARP-2024-2025/basic_analytics_for_vahydro.R index f4782636..44f39439 100644 --- a/HARP-2024-2025/basic_analytics_for_vahydro.R +++ b/HARP-2024-2025/basic_analytics_for_vahydro.R @@ -2,27 +2,19 @@ #Use for VA HYDRO metrics library(tidyr) library(sqldf) +library(zoo) -## For PRISM data -prism_summary_analytics <- function(df){ -df <- - separate (data = df, - col = obs_date, - into = c("obs_date","obs_time"), - sep = " ") -df <- - separate (data = df, - col = obs_date, - into = c("obs_year","obs_month","obs_day"), - sep = "-") +## For ANY data + +summary_analytics <- function(df){ #creating a yearly summary with each year and its total precip yearly.summary <- sqldf( - "SELECT obs_year, SUM(precip_in) AS total_precip + "SELECT yr, SUM(precip_in) AS total_precip FROM df - GROUP BY obs_year" + GROUP BY yr" ) #summary analytics @@ -31,35 +23,80 @@ precip_annual_max_in <- max(yearly.summary$total_precip) precip_annual_max_year <- - yearly.summary$obs_year[which.max(yearly.summary$total_precip)] + yearly.summary$yr[which.max(yearly.summary$total_precip)] precip_annual_mean_in <- mean(yearly.summary$total_precip) -#For min values and years, we can exclude the last row since the current year -#is incomplete data - +#For min values and years, we can exclude the first and last row since the +#current year and first years are incomplete data precip_annual_min_in <- - min(yearly.summary$total_precip[-nrow(yearly.summary)]) + min(yearly.summary$total_precip[c(-nrow(yearly.summary),-1)]) precip_annual_min_year <- - yearly.summary$obs_year[which.min - (yearly.summary$total_precip[-nrow - (yearly.summary)])] + yearly.summary$yr[which.min + (yearly.summary$total_precip[c(-nrow + (yearly.summary),-1)])] + + + +# Create daily summary to use for all data. This makes hourly data daily sums. +daily.summary <- + sqldf( + "SELECT yr, mo, da, obs_date, SUM(precip_in) AS total_precip + FROM df + GROUP BY yr, mo, da" + ) #include obs_date after SQL to data frame +precip_daily_max_in <- max(daily.summary$total_precip) +#move to beginning -precip_daily_max_in <- max(df$precip_in) -precip_hourly_max_in <- max(df$precip_in)/24 -#precip_hourly_max_in done using interpolation (not super realistic). -#Alternatively, we could look at the rainfall distribution table for a +#if else statement evaluates the amount of unique hours and if 24, +#then hourly max is taken. If not, hourly max is NA +if(length(unique(df$hr)) == 24){ + precip_hourly_max_in <- max(df$precip_in) +} else { + precip_hourly_max_in <- NA +} +#Alternatively to a null value for hourly precip in daily data, +#we could look at the rainfall distribution table for a #type II storm and multiply by the max P(t)/P(24) value. 
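+#Sketch of that alternative (assumptions flagged): scale the daily maximum by an
+#assumed peak-hour fraction of a 24-hr design storm. The fraction used here is a
+#placeholder, not a value read from the NRCS table; substitute the largest 1-hour
+#increment of P(t)/P(24) for the Type II distribution before relying on it.
+peak_hour_fraction <- 0.45 #placeholder fraction only, for illustration
+precip_hourly_max_est_in <- precip_daily_max_in * peak_hour_fraction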
+#this function uses NULL for daily data + + +#l90 using zoo package +l90_precip_in_roll <- + min(rollapply(daily.summary$total_precip, width = 90, FUN = mean, + fill = NA, align = "right"), + na.rm = TRUE) + +#l90 using IHA +if(length(unique(df$hr)) == 24){ + Qout_zoo <- zoo(as.numeric(daily.summary$total_precip), order.by = as.POSIXct(daily.summary$obs_date)) + Qout_g2 <- data.frame(group2(Qout_zoo)) + l90_precip_in <- min(Qout_g2$X90.Day.Min) +} else { + Qout_zoo <- zoo(as.numeric(df$precip_in), order.by = as.POSIXct(df$obs_date)) + Qout_g2 <- data.frame(group2(Qout_zoo)) + l90_precip_in <- min(Qout_g2$X90.Day.Min) +} + -#not sure about last metric, 190_precip_in. -print(c(precip_annual_max_in, precip_annual_max_year, precip_annual_mean_in, - precip_annual_min_in, precip_annual_min_year, precip_daily_max_in, - precip_hourly_max_in)) #move into dataframe with 7 columns and 1 row + +#makes data frame with all 9 metrics +metrics<- data.frame(precip_annual_max_in = precip_annual_max_in, + precip_annual_max_year = precip_annual_max_year, + precip_annual_mean_in = precip_annual_mean_in, + precip_annual_min_in = precip_annual_min_in, + precip_annual_min_year = precip_annual_min_year, + precip_daily_max_in = precip_daily_max_in, + precip_hourly_max_in = precip_hourly_max_in, + l90_precip_in = l90_precip_in, + l90_precip_in_roll = l90_precip_in_roll) } + +#----Ignore old functions # For Daymet daymet_summary_analytics <- function(df){ @@ -177,8 +214,3 @@ nldas2_summary_analytics <- function(df){ precip_annual_min_in, precip_annual_min_year, precip_daily_max_in, precip_hourly_max_in)) } - - -metric <- c("precip_annual_max_in", 'precip_annual_max_year', 'precip_annual_mean_in', - 'precip_annual_min_in', 'precip_annual_min_year', 'precip_hourly_max_in', - 'precip_daily_max_in') \ No newline at end of file diff --git a/HARP-2024-2025/caching function.R b/HARP-2024-2025/caching function.R new file mode 100644 index 00000000..36a3b20b --- /dev/null +++ b/HARP-2024-2025/caching function.R @@ -0,0 +1,33 @@ +# set up a caching function for NWIS +Sys.setenv(USGS_cache_dir = "/media/model/usgs") + +hydro_tools <- 'https://raw.githubusercontent.com/HARPgroup/hydro-tools/master' +cbp6_location <- 'https://raw.githubusercontent.com/HARPgroup/cbp6/master' +hydro_tools_location <- hydro_tools +om_location <- "https://raw.githubusercontent.com/HARPgroup/om/master" +elfgen_location <- "https://raw.githubusercontent.com/HARPgroup/elfgen/master" +openmi_om_location <- "https://raw.githubusercontent.com/HARPgroup/openmi-om/master" +vahydro_location <- "https://github.com/HARPgroup/vahydro/blob/master" +github_location <- "C:/usr/local/home/git" +HARParchive_location <- paste0(github_location, "/HARParchive") +# foundation_location <- "C:\\Users\\faw18626\\OneDrive - Commonwealth of Virginia" +foundation_location <- paste0(Sys.getenv("USERPROFILE"),"\\OneDrive - Commonwealth of Virginia\\OWS\\foundation_datasets\\wsp\\wsp2020") +onedrive_location <- paste0(Sys.getenv("USERPROFILE"),"\\OneDrive - Commonwealth of Virginia") + +folder <- '/Workspace/tmp/' +export_path <- '/Workspace/tmp/' +base_url <- 'http://deq1.bse.vt.edu:81/d.dh' #needed for REST functions +save_url <- "http://deq1.bse.vt.edu:81/data/proj3/out"; +noaa_api_key <- 'OdnQxMwsNwbNTPiFJRygEwxflkOIrxhh' +options(noaakey = noaa_api_key) + +#### Old Stuff No Longer Needed? 
+vahydro_directory <- "/usr/local/home/git/r-dh-ecohydro/Analysis/fn_vahydro-2.0" +auth_directory <- "/usr/local/home/git/r-dh-ecohydro/ELFGEN/internal" +file_directory <- "/Workspace/tmp" +save_directory <- "/Workspace/tmp" +fxn_locations <- "/usr/local/home/git/r-dh-ecohydro/ELFGEN/internal/" +habitat_files <- "/usr/local/home/git/r-dh-ecohydro/Analysis/Habitat/" +fxn_vahydro <- "/usr/local/home/git/hydro-tools" +base_directory <- "/usr/local/home/git/r-dh-ecohydro" + diff --git a/HARP-2024-2025/retrieve_vahydro_metrics.R b/HARP-2024-2025/retrieve_vahydro_metrics.R new file mode 100644 index 00000000..2c4b9ecf --- /dev/null +++ b/HARP-2024-2025/retrieve_vahydro_metrics.R @@ -0,0 +1,12 @@ +# GET VAHydro 1.0 RIVERSEG l90_Qout DATA +mdf <- data.frame( + 'model_version' = c('cbp-6.1'), + 'runid' = c('stormVol_prism'), + 'metric' = c('precip_annual_max_in'), + 'runlabel' = c('precip_annual_max_in') +) +met_data <- om_vahydro_metric_grid( + metric = metric, runids = mdf, bundle="watershed", ftype="usgs_full_drainage", + base_url = paste(site,'entity-model-prop-level-export',sep="/"), + ds = ds +) diff --git a/HARP-2024-2025/using_basic_analytics.R b/HARP-2024-2025/using_basic_analytics.R index 02b0a617..1d16c2ca 100644 --- a/HARP-2024-2025/using_basic_analytics.R +++ b/HARP-2024-2025/using_basic_analytics.R @@ -1,14 +1,38 @@ -source("C:/Users/natef/OneDrive - Virginia Tech/HARP/Github/HARParchive/HARP-2024-2025/basic analytics for vahydro.R") +source("C:/Users/natef/OneDrive - Virginia Tech/HARP/Github/HARParchive/HARP-2024-2025/basic_analytics_for_vahydro.R") prism <- read.csv("http://deq1.bse.vt.edu:81/met/stormVol_prism/precip/usgs_ws_01613900-PRISM-all.csv") -prism.summary <- prism_summary_analytics(prism) +prism_summary <- summary_analytics(prism) + daymet <- read.csv("http://deq1.bse.vt.edu:81/met/daymet/precip/usgs_ws_01613900-daymet-all.csv") -daymet.summary <- daymet_summary_analytics(daymet) +daymet_summary <- summary_analytics(daymet) nldas2 <- read.csv("http://deq1.bse.vt.edu:81/met/nldas2/precip/usgs_ws_01613900-nldas2-all.csv") -nldas2.summary <- nldas2_summary_analytics(nldas2) +nldas2_summary <- summary_analytics(nldas2) #issues with l90 -- zoo unique values + +#pull any value like this: + +prism_l90_precip_in <- prism_summary$l90_precip_in +daymet_l90_precip_in <- daymet_summary$l90_precip_in +nldas2_l90_precip_in <- nldas2_summary$l90_precip_in + +#Or even without the summary: + +summary_analytics(prism)$l90_precip_in + +visualizations <- function(prism.metric, daymet.metric, nldas2.metric, title = NULL){ +ggplot(data = NULL, mapping = aes(x = c("prism","dayment","nldas2"), +y = c(prism.metric, daymet.metric, nldas2.metric)))+ + geom_bar(stat = "identity", fill = "lightblue3")+ + labs(title = title, y = "precip (in)", x = "model")} + +visualizations(prism.metric = prism_summary$precip_annual_max_in, + daymet.metric = daymet_summary$precip_annual_max_in, + nldas2.metric = nldas2_summary$precip_annual_max_in, + title = "yearly max precip") +visualizations(prism.metric = prism_summary$precip_annual_min_in, + daymet.metric = daymet_summary$precip_annual_min_in, + nldas2.metric = nldas2_summary$precip_annual_min_in, + title = "yearly min precip") -analytics <- data.frame(metric, prism.summary)#, daymet.summary, nldas2.summary) -precip_annual_max_in <- analytics$prism.summary[1] diff --git a/HARP-2024-2025/variable definitions.R b/HARP-2024-2025/variable definitions.R new file mode 100644 index 00000000..d5e029e7 --- /dev/null +++ b/HARP-2024-2025/variable definitions.R @@ -0,0 
+1,6 @@ +coverage_hydrocode <- "usgs_ws_01613900" +coverage_bundle <- "watershed" #type of coverage we are looking for +coverage_ftype <- "usgs_full_drainage" #feature type +model_version <- "cbp-6.1" +met_file <- "http://deq1.bse.vt.edu:81/met/stormVol_prism/precip/usgs_ws_01613900-PRISM-all.csv" +scenario_name <- "stormVol_prism" From 6b6ecde6069a78f601eb9c9c63a42a4db24b0b17 Mon Sep 17 00:00:00 2001 From: Nate Foley Date: Fri, 25 Oct 2024 10:43:15 -0400 Subject: [PATCH 5/8] 10.25 metrics commit --- HARP-2024-2025/VAHydro_metric_script.R | 116 +++++++++++++++++++++++ HARP-2024-2025/precip_coverage_summary.R | 61 ++++++++++++ HARP-2024-2025/yearly_metrics.R | 44 +++++++++ 3 files changed, 221 insertions(+) create mode 100644 HARP-2024-2025/VAHydro_metric_script.R create mode 100644 HARP-2024-2025/precip_coverage_summary.R create mode 100644 HARP-2024-2025/yearly_metrics.R diff --git a/HARP-2024-2025/VAHydro_metric_script.R b/HARP-2024-2025/VAHydro_metric_script.R new file mode 100644 index 00000000..89330e30 --- /dev/null +++ b/HARP-2024-2025/VAHydro_metric_script.R @@ -0,0 +1,116 @@ +# Inputs (args): +# 1 = path to precip data you want to use +# 2 = path of new csv +suppressPackageStartupMessages(library(dataRetrieval)) +suppressPackageStartupMessages(library(lubridate)) +suppressPackageStartupMessages(library(sqldf)) +suppressPackageStartupMessages(library(dplyr)) +args <- commandArgs(trailingOnly = TRUE) +if (length(args) != 2) { + message("Usage: Rscript usgsdata.R gage_id output_path") + q() +} + +read_path <- args[1] +write_path <- args[2] + +message("Pull csv from input file path") + +df <- read.csv(read_path) + +summary_analytics <- function(df){ + + #creating a yearly summary with each year and its total precip + yearly.summary <- + sqldf( + "SELECT yr, SUM(precip_in) AS total_precip + FROM df + GROUP BY yr" + ) + + #summary analytics + + precip_annual_max_in <- + max(yearly.summary$total_precip) + + precip_annual_max_year <- + yearly.summary$yr[which.max(yearly.summary$total_precip)] + + precip_annual_mean_in <- + mean(yearly.summary$total_precip) + + + #For min values and years, we can exclude the first and last row since the + #current year and first years are incomplete data + precip_annual_min_in <- + min(yearly.summary$total_precip[c(-nrow(yearly.summary),-1)]) + precip_annual_min_year <- + yearly.summary$yr[which.min + (yearly.summary$total_precip[c(-nrow + (yearly.summary),-1)])] + + + + # Create daily summary to use for all data. This makes hourly data daily sums. + daily.summary <- + sqldf( + "SELECT yr, mo, da, obs_date, SUM(precip_in) AS total_precip + FROM df + GROUP BY yr, mo, da" + ) #include obs_date after SQL to data frame + precip_daily_max_in <- max(daily.summary$total_precip) + #move to beginning + + + #if else statement evaluates the amount of unique hours and if 24, + #then hourly max is taken. If not, hourly max is NA + if(length(unique(df$hr)) == 24){ + precip_hourly_max_in <- max(df$precip_in) + } else { + precip_hourly_max_in <- NA + } + #Alternatively to a null value for hourly precip in daily data, + #we could look at the rainfall distribution table for a + #type II storm and multiply by the max P(t)/P(24) value. 
+ #this function uses NULL for daily data + + + #l90 using zoo package + l90_precip_in_roll <- + min(rollapply(daily.summary$total_precip, width = 90, FUN = mean, + fill = NA, align = "right"), + na.rm = TRUE) + + #l90 using IHA + if(length(unique(df$hr)) == 24){ + Qout_zoo <- zoo(as.numeric(daily.summary$total_precip), order.by = as.POSIXct(daily.summary$obs_date)) + Qout_g2 <- data.frame(group2(Qout_zoo)) + l90_precip_in <- min(Qout_g2$X90.Day.Min) + } else { + Qout_zoo <- zoo(as.numeric(df$precip_in), order.by = as.POSIXct(df$obs_date)) + Qout_g2 <- data.frame(group2(Qout_zoo)) + l90_precip_in <- min(Qout_g2$X90.Day.Min) + } + + + + #makes data frame with all 9 metrics + metrics<- data.frame(precip_annual_max_in = precip_annual_max_in, + precip_annual_max_year = precip_annual_max_year, + precip_annual_mean_in = precip_annual_mean_in, + precip_annual_min_in = precip_annual_min_in, + precip_annual_min_year = precip_annual_min_year, + precip_daily_max_in = precip_daily_max_in, + precip_hourly_max_in = precip_hourly_max_in, + l90_precip_in = l90_precip_in, + l90_precip_in_roll = l90_precip_in_roll) +} + + +summary_analytics(df) +#What would be the best way to format the output csv? + + +message(paste0("Write csv in new file path: ",write_path)) +write.csv(precip_metric,write_path) + diff --git a/HARP-2024-2025/precip_coverage_summary.R b/HARP-2024-2025/precip_coverage_summary.R new file mode 100644 index 00000000..12bca0ba --- /dev/null +++ b/HARP-2024-2025/precip_coverage_summary.R @@ -0,0 +1,61 @@ +# Insert info about this event into database +# attach a property showing the path to the original raster file +# attach a property with a web URL to the origina raster file +basepath='/var/www/R'; +source("/var/www/R/config.R") +library("lubridate") +library("hydrotools") +# Set up our data source +source(paste0(HARParchive_location,"/HARP-2024-2025/basic_analytics_for_vahydro.R")) + +ds <- RomDataSource$new(site, rest_uname = rest_uname) +ds$get_token(rest_pw) + +# Accepting command arguments: +argst <- commandArgs(trailingOnly = T) +if (length(argst) < 7) { + + message("Use: Rscript met_store_info.R $ddate datasource coverage_hydrocode coverage_bundle coverage_ftype model_version met_file") + message("Ex: Rscript met_store_info.R $ddate nldas2 N5113 landunit cbp6_landseg met_file") + q('n') +} +ddate <- argst[1] +scenario_name <- argst[2] +coverage_hydrocode <- argst[3] +coverage_bundle <- argst[4] +coverage_ftype <- argst[5] +model_version <- argst[6] +met_file <- argst[7] +# load the feature -- get feature hydroid +# find the dh_timeseries_weather record for this event +# attach an image property to the record +# return + message(paste("Searching for feature hydrocode=", coverage_hydrocode,"with ftype",coverage_ftype)) +feature <- RomFeature$new( + ds, + list( + hydrocode=coverage_hydrocode, + ftype=coverage_ftype, + bundle=coverage_bundle + ), + TRUE +) +# this will create or retrieve a model scenario to house this summary data. +model <- om_model_object(ds, feature, model_version) +# if a matching model does not exist, this will go ahead and create one +scenario <- om_get_model_scenario(ds, model, scenario_name) + +met_data <- read.table(met_file, header = TRUE, sep=",") + + +summary <- summary_analytics(met_data) + +numrecs <- nrow(met_data) # what does this do? 
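+#(nrow(met_data) is the number of timeseries records read from met_file; it is
+#posted below as the num_records constant so the scenario retains a record of how
+#much data backed these summary metrics.)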
+vahydro_post_metric_to_scenprop(scenario$pid, 'om_class_Constant', NULL, 'num_records', numrecs, ds) + +vahydro_post_metric_to_scenprop(scenario$pid, 'om_class_Constant', NULL, 'precip_annual_max_in', summary$precip_annual_max_in, ds) + + + + + diff --git a/HARP-2024-2025/yearly_metrics.R b/HARP-2024-2025/yearly_metrics.R new file mode 100644 index 00000000..53e9fd1f --- /dev/null +++ b/HARP-2024-2025/yearly_metrics.R @@ -0,0 +1,44 @@ +#number of days with precip_in and longest duration storm per yr + +library(sqldf) + +#this function uses a series of SQL chunks to compute yearly data frames +#that include helpful QA metrics. +yearly_metrics <- function(df){ +# Create Daily summary to make all data daily +daily_summary <- + sqldf( + "SELECT yr, mo, da, obs_date, SUM(precip_in) AS precip_in + FROM df + GROUP BY yr, mo, da" + ) #include obs_date after SQL to data frame + +# Number of days with precip_in per yr +rainfall_days <- sqldf("SELECT yr, COUNT(*) + AS DaysWithRainfall + FROM daily_summary + WHERE precip_in > 0 + GROUP BY yr") + +# Longest duration storm per yr +storm_events <- sqldf("WITH RainEvents AS ( + SELECT obs_date, precip_in, yr, + (SELECT SUM(precip_in = 0) + FROM daily_summary rd2 + WHERE rd2.obs_date <= rd1.obs_date) + AS StormGroup + FROM daily_summary rd1), + StormDurations AS ( + SELECT yr, StormGroup, COUNT(*) + AS StormDuration + FROM RainEvents + WHERE precip_in > 0 + GROUP BY yr, StormGroup) + SELECT yr, MAX(StormDuration) AS LongestStorm + FROM StormDurations + GROUP BY yr") + +# Printing results +merge(rainfall_days, storm_events) +} +## It seems like NLDAS has much higher results... \ No newline at end of file From c37a8b774a5160708956fe2f3063814a161be10c Mon Sep 17 00:00:00 2001 From: Nate Foley Date: Mon, 4 Nov 2024 12:01:37 -0500 Subject: [PATCH 6/8] better visualizations and minor changes including commenting --- HARP-2024-2025/basic_analytics_for_vahydro.R | 119 ------------------- HARP-2024-2025/precip_coverage_summary.R | 2 +- HARP-2024-2025/using_basic_analytics.R | 49 +++++--- HARP-2024-2025/yearly_metrics.R | 48 +++++--- 4 files changed, 69 insertions(+), 149 deletions(-) diff --git a/HARP-2024-2025/basic_analytics_for_vahydro.R b/HARP-2024-2025/basic_analytics_for_vahydro.R index 44f39439..a54ce3f4 100644 --- a/HARP-2024-2025/basic_analytics_for_vahydro.R +++ b/HARP-2024-2025/basic_analytics_for_vahydro.R @@ -95,122 +95,3 @@ metrics<- data.frame(precip_annual_max_in = precip_annual_max_in, l90_precip_in_roll = l90_precip_in_roll) } - -#----Ignore old functions -# For Daymet - -daymet_summary_analytics <- function(df){ - df <- - separate (data = df, - col = obs_date, - into = c("obs_date","obs_time"), - sep = " ") - df <- - separate (data = df, - col = obs_date, - into = c("obs_year","obs_month","obs_day"), - sep = "-") - - #creating a yearly summary with each year and its total precip - yearly.summary <- - sqldf( - "SELECT obs_year, SUM(precip_in) AS total_precip - FROM df - GROUP BY obs_year" - ) - - #summary analytics - - precip_annual_max_in <- - max(yearly.summary$total_precip) - - precip_annual_max_year <- - yearly.summary$obs_year[which.max(yearly.summary$total_precip)] - - precip_annual_mean_in <- - mean(yearly.summary$total_precip) - - - #For min values and years, we can exclude the last row since the current year - #is incomplete data - - precip_annual_min_in <- - min(yearly.summary$total_precip[-nrow(yearly.summary)]) - precip_annual_min_year <- - yearly.summary$obs_year[which.min - (yearly.summary$total_precip[-nrow - (yearly.summary)])] - 
- - precip_daily_max_in <- max(df$precip_in) - precip_hourly_max_in <- max(df$precip_in)/24 - #precip_hourly_max_in done using interpolation (not super realistic). - #Alternatively, we could look at the rainfall distribution table for a - #type II storm and multiply by the max P(t)/P(24) value. - - #not sure about last metric, 190_precip_in. - print(c(precip_annual_max_in, precip_annual_max_year, precip_annual_mean_in, - precip_annual_min_in, precip_annual_min_year, precip_daily_max_in, - precip_hourly_max_in)) -} - -# For nldas2 - -nldas2_summary_analytics <- function(df){ - df <- - separate (data = df, - col = obs_date, - into = c("obs_date","obs_time"), - sep = " ") - df <- - separate (data = df, - col = obs_date, - into = c("obs_year","obs_month","obs_day"), - sep = "-") - - #creating a yearly summary with each year and its total precip - yearly.summary <- - sqldf( - "SELECT obs_year, SUM(precip_in) AS total_precip - FROM df - GROUP BY obs_year" - ) - - #summary analytics - - precip_annual_max_in <- - max(yearly.summary$total_precip) - - precip_annual_max_year <- - yearly.summary$obs_year[which.max(yearly.summary$total_precip)] - - precip_annual_mean_in <- - mean(yearly.summary$total_precip) - - - #For min values and years, we can exclude the last row since the current year - #is incomplete data - - precip_annual_min_in <- - min(yearly.summary$total_precip[c(-nrow(yearly.summary),-1)]) - precip_annual_min_year <- - yearly.summary$obs_year[which.min - (yearly.summary$total_precip[c(-nrow - (yearly.summary),-1)])] - daily.summary <- - sqldf( - "SELECT obs_year, obs_month, obs_day, SUM(precip_in) AS total_precip - FROM df - GROUP BY obs_year, obs_month, obs_day" - ) - - precip_daily_max_in <- max(daily.summary$total_precip[-1]) - - precip_hourly_max_in <- max(df$precip_in) - - - #not sure about last metric, 190_precip_in. - print(c(precip_annual_max_in, precip_annual_max_year, precip_annual_mean_in, - precip_annual_min_in, precip_annual_min_year, precip_daily_max_in, - precip_hourly_max_in)) -} diff --git a/HARP-2024-2025/precip_coverage_summary.R b/HARP-2024-2025/precip_coverage_summary.R index 12bca0ba..d5a414b4 100644 --- a/HARP-2024-2025/precip_coverage_summary.R +++ b/HARP-2024-2025/precip_coverage_summary.R @@ -48,7 +48,7 @@ scenario <- om_get_model_scenario(ds, model, scenario_name) met_data <- read.table(met_file, header = TRUE, sep=",") -summary <- summary_analytics(met_data) +precip_summary <- summary_analytics(met_data) numrecs <- nrow(met_data) # what does this do? 
vahydro_post_metric_to_scenprop(scenario$pid, 'om_class_Constant', NULL, 'num_records', numrecs, ds) diff --git a/HARP-2024-2025/using_basic_analytics.R b/HARP-2024-2025/using_basic_analytics.R index 1d16c2ca..0294e685 100644 --- a/HARP-2024-2025/using_basic_analytics.R +++ b/HARP-2024-2025/using_basic_analytics.R @@ -1,5 +1,7 @@ source("C:/Users/natef/OneDrive - Virginia Tech/HARP/Github/HARParchive/HARP-2024-2025/basic_analytics_for_vahydro.R") - +library("IHA") +library("ggplot2") +library(sqldf) prism <- read.csv("http://deq1.bse.vt.edu:81/met/stormVol_prism/precip/usgs_ws_01613900-PRISM-all.csv") prism_summary <- summary_analytics(prism) @@ -20,19 +22,36 @@ nldas2_l90_precip_in <- nldas2_summary$l90_precip_in summary_analytics(prism)$l90_precip_in -visualizations <- function(prism.metric, daymet.metric, nldas2.metric, title = NULL){ -ggplot(data = NULL, mapping = aes(x = c("prism","dayment","nldas2"), -y = c(prism.metric, daymet.metric, nldas2.metric)))+ - geom_bar(stat = "identity", fill = "lightblue3")+ - labs(title = title, y = "precip (in)", x = "model")} - -visualizations(prism.metric = prism_summary$precip_annual_max_in, - daymet.metric = daymet_summary$precip_annual_max_in, - nldas2.metric = nldas2_summary$precip_annual_max_in, - title = "yearly max precip") -visualizations(prism.metric = prism_summary$precip_annual_min_in, - daymet.metric = daymet_summary$precip_annual_min_in, - nldas2.metric = nldas2_summary$precip_annual_min_in, - title = "yearly min precip") +#visualizing data: + +# visualizations <- function(prism.metric, daymet.metric, nldas2.metric, title = NULL){ +# ggplot(data = NULL, mapping = aes(x = c("prism","dayment","nldas2"), +# y = c(prism.metric, daymet.metric, nldas2.metric)))+ +# geom_bar(stat = "identity", fill = "lightblue3")+ +# labs(title = title, y = "precip (in)", x = "model")} +# +# visualizations(prism.metric = prism_summary$precip_annual_max_in, +# daymet.metric = daymet_summary$precip_annual_max_in, +# nldas2.metric = nldas2_summary$precip_annual_max_in, +# title = "yearly max precip") +# visualizations(prism.metric = prism_summary$precip_annual_min_in, +# daymet.metric = daymet_summary$precip_annual_min_in, +# nldas2.metric = nldas2_summary$precip_annual_min_in, +# title = "yearly min precip") + +visualizations <- function(gage_id, metric){ + ggplot(data = NULL, mapping = aes(x = c("prism","dayment","nldas2"), + y = c(summary_analytics(read.csv(paste0("http://deq1.bse.vt.edu:81/met/stormVol_prism/precip/",gage_id,"-PRISM-all.csv")))[[metric]], + summary_analytics(read.csv(paste0("http://deq1.bse.vt.edu:81/met/daymet/precip/",gage_id,"-daymet-all.csv")))[[metric]], + summary_analytics(read.csv(paste0("http://deq1.bse.vt.edu:81/met/nldas2/precip/",gage_id,"-nldas2-all.csv")))[[metric]])))+ + geom_bar(stat = "identity", fill = "lightblue3")+ + labs(title = as.character(metric), y = "precip (in)", x = "model")} + +# perform basic bar plot visualizations for any of these 5 metrics for any gage +visualizations(gage_id = "usgs_ws_01613900", metric = "precip_annual_max_in") +visualizations(gage_id = "usgs_ws_01613900", metric = "l90_precip_in") +visualizations(gage_id = "usgs_ws_01613900", metric = "precip_annual_mean_in") +visualizations(gage_id = "usgs_ws_01613900", metric = "precip_annual_min_in") +visualizations(gage_id = "usgs_ws_01613900", metric = "precip_daily_max_in") diff --git a/HARP-2024-2025/yearly_metrics.R b/HARP-2024-2025/yearly_metrics.R index 53e9fd1f..8b0c1cfe 100644 --- a/HARP-2024-2025/yearly_metrics.R +++ 
b/HARP-2024-2025/yearly_metrics.R @@ -14,31 +14,51 @@ daily_summary <- ) #include obs_date after SQL to data frame # Number of days with precip_in per yr +rainfall_hist <- ggplot(data = daily_summary, mapping = aes(x = obs_date, y = precip_in))+ + geom_histogram() rainfall_days <- sqldf("SELECT yr, COUNT(*) AS DaysWithRainfall FROM daily_summary - WHERE precip_in > 0 + WHERE precip_in > 0.01 GROUP BY yr") -# Longest duration storm per yr +# # Longest duration storm per yr +# this marks the storm groups by separating due to days with no rainfall +#the NWS states that >0.01 is "measurable precipitation". +#this changes our numbers by ~20 days for the Hogue creek gage storm_events <- sqldf("WITH RainEvents AS ( - SELECT obs_date, precip_in, yr, - (SELECT SUM(precip_in = 0) - FROM daily_summary rd2 - WHERE rd2.obs_date <= rd1.obs_date) - AS StormGroup - FROM daily_summary rd1), + SELECT obs_date, precip_in, yr, + (SELECT SUM(precip_in <= 0.01) + FROM daily_summary rd2 + WHERE rd2.obs_date <= rd1.obs_date) + AS StormGroup + FROM daily_summary rd1), StormDurations AS ( - SELECT yr, StormGroup, COUNT(*) + SELECT yr, StormGroup, COUNT(*) AS StormDuration FROM RainEvents - WHERE precip_in > 0 + WHERE precip_in > 0.01 GROUP BY yr, StormGroup) - SELECT yr, MAX(StormDuration) AS LongestStorm - FROM StormDurations + SELECT yr, MAX(StormDuration) AS LongestStorm + FROM StormDurations GROUP BY yr") # Printing results -merge(rainfall_days, storm_events) +#merge(rainfall_days) +return(merge(rainfall_days,storm_events)) } -## It seems like NLDAS has much higher results... \ No newline at end of file +## It seems like NLDAS has much higher results... + +yearly_metrics(prism) +yearly_metrics(nldas2) + + +daily_summary <- + sqldf( + "SELECT yr, mo, da, obs_date, SUM(precip_in) AS precip_in + FROM df + GROUP BY yr, mo, da" + ) + +rainfall_hist <- ggplot(data = daily_summary, mapping = aes(x = precip_in))+ + geom_histogram(binwidth = 0.01) From 1ead13cc050a22d833eb533be12bfbf90067ba20 Mon Sep 17 00:00:00 2001 From: Nate Foley Date: Fri, 8 Nov 2024 09:28:04 -0500 Subject: [PATCH 7/8] updates for integration --- HARP-2024-2025/basic_analytics_for_vahydro.R | 11 ++++----- HARP-2024-2025/precip_coverage_summary.R | 22 +++++++++--------- HARP-2024-2025/using_basic_analytics.R | 24 ++++++++++---------- HARP-2024-2025/yearly_metrics.R | 9 ++++++++ 4 files changed, 37 insertions(+), 29 deletions(-) diff --git a/HARP-2024-2025/basic_analytics_for_vahydro.R b/HARP-2024-2025/basic_analytics_for_vahydro.R index a54ce3f4..0aa73393 100644 --- a/HARP-2024-2025/basic_analytics_for_vahydro.R +++ b/HARP-2024-2025/basic_analytics_for_vahydro.R @@ -12,9 +12,10 @@ summary_analytics <- function(df){ #creating a yearly summary with each year and its total precip yearly.summary <- sqldf( - "SELECT yr, SUM(precip_in) AS total_precip + "SELECT COUNT(*) AS ndays, yr, SUM(precip_in) AS total_precip FROM df - GROUP BY yr" + GROUP BY yr + HAVING ndays >= 365" ) #summary analytics @@ -32,11 +33,9 @@ precip_annual_mean_in <- #For min values and years, we can exclude the first and last row since the #current year and first years are incomplete data precip_annual_min_in <- - min(yearly.summary$total_precip[c(-nrow(yearly.summary),-1)]) + min(yearly.summary$total_precip) precip_annual_min_year <- - yearly.summary$yr[which.min - (yearly.summary$total_precip[c(-nrow - (yearly.summary),-1)])] + yearly.summary$yr[which.min(yearly.summary$total_precip)] diff --git a/HARP-2024-2025/precip_coverage_summary.R b/HARP-2024-2025/precip_coverage_summary.R 
index d5a414b4..735d67db 100644 --- a/HARP-2024-2025/precip_coverage_summary.R +++ b/HARP-2024-2025/precip_coverage_summary.R @@ -13,19 +13,18 @@ ds$get_token(rest_pw) # Accepting command arguments: argst <- commandArgs(trailingOnly = T) -if (length(argst) < 7) { +if (length(argst) < 6) { - message("Use: Rscript met_store_info.R $ddate datasource coverage_hydrocode coverage_bundle coverage_ftype model_version met_file") - message("Ex: Rscript met_store_info.R $ddate nldas2 N5113 landunit cbp6_landseg met_file") + message("Use: Rscript met_store_info.R datasource coverage_hydrocode coverage_bundle coverage_ftype model_version met_file") + message("Ex: Rscript met_store_info.R nldas2 N5113 landunit cbp6_landseg met_file") q('n') } -ddate <- argst[1] -scenario_name <- argst[2] -coverage_hydrocode <- argst[3] -coverage_bundle <- argst[4] -coverage_ftype <- argst[5] -model_version <- argst[6] -met_file <- argst[7] +scenario_name <- argst[1] +coverage_hydrocode <- argst[2] +coverage_bundle <- argst[3] +coverage_ftype <- argst[4] +model_version <- argst[5] +met_file <- argst[6] # load the feature -- get feature hydroid # find the dh_timeseries_weather record for this event # attach an image property to the record @@ -53,8 +52,9 @@ precip_summary <- summary_analytics(met_data) numrecs <- nrow(met_data) # what does this do? vahydro_post_metric_to_scenprop(scenario$pid, 'om_class_Constant', NULL, 'num_records', numrecs, ds) -vahydro_post_metric_to_scenprop(scenario$pid, 'om_class_Constant', NULL, 'precip_annual_max_in', summary$precip_annual_max_in, ds) +vahydro_post_metric_to_scenprop(scenario$pid, 'om_class_Constant', NULL, 'precip_annual_max_in', precip_summary$precip_annual_max_in, ds) +vahydro_post_metric_to_scenprop(scenario$pid, 'om_class_Constant', NULL, 'precip_annual_min_in', precip_summary$precip_annual_max_in, ds) diff --git a/HARP-2024-2025/using_basic_analytics.R b/HARP-2024-2025/using_basic_analytics.R index 0294e685..84c8ad3e 100644 --- a/HARP-2024-2025/using_basic_analytics.R +++ b/HARP-2024-2025/using_basic_analytics.R @@ -2,21 +2,21 @@ source("C:/Users/natef/OneDrive - Virginia Tech/HARP/Github/HARParchive/HARP-202 library("IHA") library("ggplot2") library(sqldf) -prism <- read.csv("http://deq1.bse.vt.edu:81/met/stormVol_prism/precip/usgs_ws_01613900-PRISM-all.csv") -prism_summary <- summary_analytics(prism) - - -daymet <- read.csv("http://deq1.bse.vt.edu:81/met/daymet/precip/usgs_ws_01613900-daymet-all.csv") -daymet_summary <- summary_analytics(daymet) - -nldas2 <- read.csv("http://deq1.bse.vt.edu:81/met/nldas2/precip/usgs_ws_01613900-nldas2-all.csv") -nldas2_summary <- summary_analytics(nldas2) #issues with l90 -- zoo unique values +# prism <- read.csv("http://deq1.bse.vt.edu:81/met/stormVol_prism/precip/usgs_ws_01613900-PRISM-all.csv") +# prism_summary <- summary_analytics(prism) +# +# +# daymet <- read.csv("http://deq1.bse.vt.edu:81/met/daymet/precip/usgs_ws_01613900-daymet-all.csv") +# daymet_summary <- summary_analytics(daymet) +# +# nldas2 <- read.csv("http://deq1.bse.vt.edu:81/met/nldas2/precip/usgs_ws_01613900-nldas2-all.csv") +# nldas2_summary <- summary_analytics(nldas2) #pull any value like this: -prism_l90_precip_in <- prism_summary$l90_precip_in -daymet_l90_precip_in <- daymet_summary$l90_precip_in -nldas2_l90_precip_in <- nldas2_summary$l90_precip_in +# prism_l90_precip_in <- prism_summary$l90_precip_in +# daymet_l90_precip_in <- daymet_summary$l90_precip_in +# nldas2_l90_precip_in <- nldas2_summary$l90_precip_in #Or even without the summary: diff --git 
a/HARP-2024-2025/yearly_metrics.R b/HARP-2024-2025/yearly_metrics.R index 8b0c1cfe..f8716b53 100644 --- a/HARP-2024-2025/yearly_metrics.R +++ b/HARP-2024-2025/yearly_metrics.R @@ -43,9 +43,18 @@ storm_events <- sqldf("WITH RainEvents AS ( FROM StormDurations GROUP BY yr") +# RainEvents <- sqldf(" +# SELECT obs_date, precip_in, yr, +# (SELECT SUM(precip_in <= 0.01) +# FROM daily_summary AS rd2 +# WHERE rd2.obs_date <= rd1.obs_date) +# AS StormGroup +# FROM daily_summary AS rd1") + # Printing results #merge(rainfall_days) return(merge(rainfall_days,storm_events)) +return(RainEvents) } ## It seems like NLDAS has much higher results... From 93c19c326435e3ae033b8bceb2ce84881b805260 Mon Sep 17 00:00:00 2001 From: Nate Foley Date: Fri, 8 Nov 2024 09:49:41 -0500 Subject: [PATCH 8/8] updates for integration --- HARP-2024-2025/DataSourceEvaluations.R | 147 ------- HARP-2024-2025/GIS script.R | 140 ------ HARP-2024-2025/StormResults_Analysis.R | 394 ----------------- HARP-2024-2025/VAHydro_metric_script.R | 116 ----- HARP-2024-2025/access-db.R | 167 ------- HARP-2024-2025/access-file.R | 418 ------------------ HARP-2024-2025/assessEventPrecip.R | 312 -------------- HARP-2024-2025/caching function.R | 33 -- HARP-2024-2025/dni_data_experiments.R | 9 - HARP-2024-2025/hydroimport_daily.R | 49 --- HARP-2024-2025/make_weekly_summary_ts.R | 61 --- HARP-2024-2025/retrieve_vahydro_metrics.R | 12 - HARP-2024-2025/stormSep_USGS.R | 504 ---------------------- HARP-2024-2025/storm_event_ratings.R | 39 -- HARP-2024-2025/using_basic_analytics.R | 57 --- HARP-2024-2025/variable definitions.R | 6 - HARP-2024-2025/yearly_metrics.R | 73 ---- 17 files changed, 2537 deletions(-) delete mode 100644 HARP-2024-2025/DataSourceEvaluations.R delete mode 100644 HARP-2024-2025/GIS script.R delete mode 100644 HARP-2024-2025/StormResults_Analysis.R delete mode 100644 HARP-2024-2025/VAHydro_metric_script.R delete mode 100644 HARP-2024-2025/access-db.R delete mode 100644 HARP-2024-2025/access-file.R delete mode 100644 HARP-2024-2025/assessEventPrecip.R delete mode 100644 HARP-2024-2025/caching function.R delete mode 100644 HARP-2024-2025/dni_data_experiments.R delete mode 100644 HARP-2024-2025/hydroimport_daily.R delete mode 100644 HARP-2024-2025/make_weekly_summary_ts.R delete mode 100644 HARP-2024-2025/retrieve_vahydro_metrics.R delete mode 100644 HARP-2024-2025/stormSep_USGS.R delete mode 100644 HARP-2024-2025/storm_event_ratings.R delete mode 100644 HARP-2024-2025/using_basic_analytics.R delete mode 100644 HARP-2024-2025/variable definitions.R delete mode 100644 HARP-2024-2025/yearly_metrics.R diff --git a/HARP-2024-2025/DataSourceEvaluations.R b/HARP-2024-2025/DataSourceEvaluations.R deleted file mode 100644 index 14c0c02f..00000000 --- a/HARP-2024-2025/DataSourceEvaluations.R +++ /dev/null @@ -1,147 +0,0 @@ -# Load in libraries -library(sqldf) -library(ggplot2) -library(tidyverse) - - -################################################################################ -# PRISM List of files - -list_of_prism <- list.files(path="C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Ratings/out-prism", - recursive = TRUE, - pattern = "\\.csv$", - full.names = TRUE) -prism_list <- read_csv(list_of_prism, id="gageid") - -prism_list$gageid <- grep("0", unlist(strsplit(prism_list$gageid, "_")), value = TRUE, perl = TRUE) -prism_list$gageid <- grep("0", unlist(strsplit(prism_list$gageid, "-")), value = TRUE, perl = TRUE) -prism_list$...1 <- NULL - -prism_gages <- as.list(unique(prism_list$gageid)) - 
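# The paired grep(unlist(strsplit(...))) calls above recover the gage number
# from file paths such as ".../usgs_ws_01613900-PRISM-storm_volume-ratings.csv".
# A single regular expression does the same job in one step (sketch only; the
# example path is made up, and the pattern is similar to the gsub() used in
# GIS script.R further down):
example_path <- "out-prism/usgs_ws_01613900-PRISM-storm_volume-ratings.csv"
gsub(".*usgs_ws_(\\d+)-.*", "\\1", example_path)   # "01613900"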
-################################################################################ -# DAYMET List of files - -list_of_daymet <- list.files(path="C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Ratings/out-daymet/out", - recursive = TRUE, - pattern = "\\.csv$", - full.names = TRUE) -daymet_list <- read_csv(list_of_daymet, id="gageid") - -daymet_list$gageid <- grep("0", unlist(strsplit(daymet_list$gageid, "_")), value = TRUE, perl = TRUE) -daymet_list$gageid <- grep("0", unlist(strsplit(daymet_list$gageid, "-")), value = TRUE, perl = TRUE) -daymet_list$...1 <- NULL - -daymet_gages <- as.list(unique(daymet_list$gageid)) - -################################################################################ -# NLDAS2 List of Files - -list_of_nldas2 <- list.files(path="C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Ratings/out-daymet/out", - recursive = TRUE, - pattern = "\\.csv$", - full.names = TRUE) -nldas2_list <- read_csv(list_of_nldas2, id="gageid") - -nldas2_list$gageid <- grep("0", unlist(strsplit(nldas2_list$gageid, "_")), value = TRUE, perl = TRUE) -nldas2_list$gageid <- grep("0", unlist(strsplit(nldas2_list$gageid, "-")), value = TRUE, perl = TRUE) -nldas2_list$...1 <- NULL - -nldas2_gages <- as.list(unique(nldas2_list$gageid)) - -################################################################################ -# Find gages which ran for each data source - -complete_gages <- as.list(intersect(prism_gages,daymet_gages)) -complete_gages <- as.list(as.numeric(intersect(complete_gages,nldas2_gages))) - -################################################################################ -# -for(j in c("daymet","prism","nldas")){ - print(j) - #Set path to read in ratings based on data source of outer loop - pathToread <- paste0("C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Ratings/",j,"/") - for( i in list.files(pathToread) ){ - filei <- read.csv(paste0(pathToread,i)) - #Store the gage number - filei$gage <- gsub(".*_(\\d+)-.*","\\1",i) - #Combine the file into a single data frame - if(!exists("combineFile")){ - combineFile <- filei - }else{ - combineFile <- rbind(combineFile,filei) - } - } - #Assign to a specific variable and delete the generic combineFile - assign(paste0('combineFile',j),combineFile) - rm(combineFile) -} - -#Join the daymet and prism data together -joinData <- combineFileprism %>% - #Remove the row numbers (not important) and rename the r_squared for clarity - select(-X,prismRating = r_squared) %>% - #Join in the dayment data, but first rename the r_squared column for clarity. - #Join on gage and month - left_join(combineFiledaymet %>% - select(daymetRating = r_squared,gage,mo), - by = c("gage","mo")) %>% - #Join in the nldas data, but first rename the r_squared column for clarity. 
- #Join on gage and month - left_join(combineFilenldas %>% - select(nldasRating = r_squared,gage,mo), - by = c("gage","mo")) %>% - #Pivot it longer to have a column with the data source and one for the - #ratings, for plotting ease - pivot_longer(c(prismRating,daymetRating,nldasRating), - names_to = 'dataSource', - values_to = 'rating') - - - -################################################################################ -# Import and add drainage are from USGS -drainage_area <- dataRetrieval::readNWISsite(c(unique(joinData$gage))) -joinData <- sqldf( - "select a.mo as mo, a.gage as gageid, a.dataSource as dataSource, a.rating as rating, - b.drain_area_va as drainageArea_sqmi - from joinData as a - left outer join drainage_area as b - on ( - a.gage = b.site_no - )" -) - -joinData$dataSource <- gsub("Rating","",joinData$dataSource) - - -################################################################################ -# Trends in DA and ratings - -ggplot(data=joinData)+ - aes(y=mean(joinData$rating, group_by(gageid)),x=joinData$drainageArea_sqmi) - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/HARP-2024-2025/GIS script.R b/HARP-2024-2025/GIS script.R deleted file mode 100644 index fbcb3196..00000000 --- a/HARP-2024-2025/GIS script.R +++ /dev/null @@ -1,140 +0,0 @@ -library(tidyverse) -#Assumes all ratings files are in a folder ratings/daymet/ or ratings/prism/ -for(j in c("daymet","prism","nldas")){ - print(j) - #Set path to read in ratings based on data source of outer loop - pathToread <- paste0("C:/Users/natef/OneDrive - Virginia Tech/HARP/Ratings/",j,"/") - for( i in list.files(pathToread) ){ - #For each file in the directory, read in the file i as a csv - filei <- read.csv(paste0(pathToread,i)) - #Store the gage number - filei$gage <- gsub(".*_(\\d+)-.*","\\1",i) - #Store the analysis type - filei$workflow <- gsub(".*_","",j) - #Keep only the necessary columns, depending on the workflow: - if(filei$workflow[1] == "simplelm"){ - filei <- filei[,c("mo","rating","gage","workflow")] - }else{ - filei <- filei[,c("mo","r_squared","gage","workflow")] - } - names(filei) <- c("mo","rating","gage","workflow") - #Combine the file into a single data frame - if(!exists("combineFile")){ - combineFile <- filei - }else{ - combineFile <- rbind(combineFile,filei) - } - } - #Assign to a specific variable and delete the generic combineFile - assign(paste0('combineFile',j),combineFile) - rm(combineFile) -} - -#Join the daymet and prism data together -joinData <- combineFileprism %>% - #Rename the rating for clarity - select(prismRatingstorm = rating,gage,mo,workflow) %>% - #Join in the dayment data, but first rename the rating column for clarity. - #Join on gage, month, and workflow - full_join(combineFiledaymet %>% - select(daymetRatingstorm = rating,gage,mo,workflow), - by = c("gage","mo","workflow")) %>% - full_join(combineFiledaymet %>% - select(daymetRatinglm = rating,gage,mo,workflow), - by = c("gage","mo","workflow")) %>% - #Add remaining prism data: - full_join(combineFileprism %>% - select(prismRatinglm = rating,gage,mo,workflow), - by = c("gage","mo","workflow")) %>% - #Join in the nldas data, but first rename the rating column for clarity. 
- #Join on gage, month, and workflow - full_join(combineFilenldas %>% - select(nldasRatingstorm = rating,gage,mo,workflow), - by = c("gage","mo","workflow")) %>% - full_join(combineFilenldas %>% - select(nldasRatinglm = rating,gage,mo,workflow), - by = c("gage","mo","workflow")) %>% - #For easier viewing, combine lm and storm columns such that there is only one - #column for prism, daymet, nldas classified by the workflow column - mutate(prismRating = coalesce(prismRatingstorm,prismRatinglm), - daymetRating = coalesce(daymetRatingstorm,daymetRatinglm), - nldasRating = coalesce(nldasRatingstorm,nldasRatinglm) - ) %>% - #Remove now superflous columns: - select(-prismRatingstorm,-prismRatinglm, - -daymetRatingstorm,-daymetRatinglm, - -nldasRatingstorm,-nldasRatinglm) %>% - #Pivot it longer to have a column with the data source and one for the - #ratings, for plotting ease - pivot_longer(c(prismRating,daymetRating,nldasRating), - names_to = 'dataSource', - values_to = 'rating') %>% - arrange(gage,mo,workflow) - -#At each gage, does the best performing data source change between workflows? -#The below code is for ESTIMATES ONLY. The left_join assumes that the ratings -#are unique between datasources for each gage, workflow, month. This is brittle -#and could result in incorrect answers! - -safe_max <- function(x) { - if (length(na.omit(x)) == 0) { - return(NA) # or any other placeholder you prefer - } else { - return(max(x, na.rm = TRUE)) - } -} - -gageCompare <- joinData %>% dplyr::ungroup() %>% - #Group by gage, workflow, and month and find the max rating: - dplyr::group_by(gage,workflow,mo) %>% - dplyr::summarise(maxRating = mean(safe_max(rating))) %>% - #Join the joinData df back in matching by gage, workflow, mo, and rating. This - #could be problematic with duplicate ratings as in a case where all ratings - #across the data sources are the same value - left_join(joinData,by = c('gage','workflow','mo','maxRating' = 'rating')) %>% - #Now pivot wider such that we can see ratings and best data sources side-by-side - pivot_wider(names_from = workflow,values_from = c(maxRating,dataSource)) %>% - #Now filter to only find instances where the best data sources are different: - filter(dataSource != dataSource) %>% - #Create a column showing the difference in rating and arrange by the difference - mutate(differenceInRating = maxRating - maxRating) %>% - arrange(differenceInRating) - - - -#Isolate one month -gageCompareMonth <- gageCompare[gageCompare$mo == 10,] - -library(sf) -#Get the watershed coverage from the server -watershedGeo <- read.csv("http://deq1.bse.vt.edu:81/met/usgsGageWatershedGeofield.csv") - -#Get the gage numbers as their own field and store a copy of the data -gageWatershedSF <- watershedGeo -gageWatershedSF$gage <- gsub(".*_(\\d+)","\\1",gageWatershedSF$hydrocode) - -#Let's begin by plotting October daymet ratings for stormVol -forPlot <- joinData[joinData$dataSource == 'daymetRating' & - joinData$workflow == 'stormvol' & - joinData$mo == 10,] -#Join the geometry data onto out plot data -joinDataSF <- forPlot %>% - left_join(gageWatershedSF %>% select(-hydrocode), - by = 'gage') %>% - filter(!is.na(wkt)) -#Create an SF object. Specify the coordinate system and the field name in the -#data frame that contains the well-known text. 
In this case, WKT is the name of -#the field with the polygon geometries -joinDataSF <- st_as_sf(joinDataSF,wkt = 'wkt',crs = 4326) -#Repair broken geometries -joinDataSF <- st_make_valid(joinDataSF) -#Add shape area in coordinate system units (likely meaningless in crs 4326) -joinDataSF$area <- st_area(joinDataSF) -#Order the data by largest to smallest area to make plotting more effective -joinDataSF <- joinDataSF[order(joinDataSF$area,decreasing = TRUE),] -#Plot SF polygons -ggplot(joinDataSF) + - geom_sf(aes(fill = rating)) + - scale_fill_viridis_c(option = 'magma') + - theme_classic() - diff --git a/HARP-2024-2025/StormResults_Analysis.R b/HARP-2024-2025/StormResults_Analysis.R deleted file mode 100644 index 95b5f5ad..00000000 --- a/HARP-2024-2025/StormResults_Analysis.R +++ /dev/null @@ -1,394 +0,0 @@ -# Key for Gages: -# 01620500 = North River near Stokesville -# 01634500 = Cedar Creek near Winchester -# 01669000 = Piscataway Creek near Tappahannock -# 02021500 = Maury River at Rockbridge Falls -# 02031000 = Mechums River near White Hall -# 02044500 = Nottoway River near Rawlings -# 03176500 = New River at Glen Lyn -# 03165000 = Chesnut Creek at Galax -# 02054530 = Roanoke River at Glenvar -# 02034000 = Rivanna River at Palmyra - -# sql, ggplot2 -library(sqldf) -library(ggplot2) -library(tidyverse) - -############################################################################## -# STORM FILE DATA COMPILATION - -list_of_files <- list.files(path="C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Ratings/out", - recursive = TRUE, - pattern = "\\.csv$", - full.names = TRUE) -df <- read_csv(list_of_files, id="gageid") - -df$gageid <- grep("0", unlist(strsplit(df$gageid, "_")), value = TRUE, perl = TRUE) -df$gageid <- grep("0", unlist(strsplit(df$gageid, "-")), value = TRUE, perl = TRUE) -df$...1 <- NULL - -df$season <- ifelse (df$mo %in% c(6:8), - "SUMMER", - ifelse (df$mo %in% c(9:11), - "AUTUMN", - ifelse (df$mo %in% c(12,1,2), - "WINTER", - ifelse (df$mo %in% c(3:5), - "SPRING", NA)))) - -############################################################################## -# MONTHLY AND SEASONAL R-SQUARED PLOTS - -ggplot(data = df)+ - aes(y = season, x=r_squared, color=season)+ - geom_boxplot()+ - xlim(0,1)+ - theme_bw()+ - xlab("R-Squared")+ - ylab("Season")+ - ggtitle("Seasonal R-Squared Values ()")+ - theme(plot.title = element_text(hjust = 0.5), legend.position = "none") - - -ggplot(data = df)+ - aes(y = mo, x=r_squared)+ - geom_boxplot(mapping = aes(group = mo))+ - xlim(0,1)+ - theme_bw()+ - xlab("R-Squared")+ - ylab("Month")+ - ggtitle("Monthly R-Squared Values ()")+ - theme(plot.title = element_text(hjust = 0.5), legend.position = "none")+ - scale_y_continuous(breaks = seq(0,12,1)) - - -############################################################################ -# Highest and lowest avg R-squared - -gage_mean<- aggregate(x= df$r_squared, - # Specify group indicator - by = list(df$gageid), - # Specify function (i.e. 
mean) - FUN = mean, - na.rm = TRUE ) -print(gage_mean) - -ggplot(data = gage_mean, - mapping = aes(x=x))+ - geom_density(color="dodgerblue3")+ - theme_bw()+ - ylab("Density")+ - xlab("R-Squared")+ - ggtitle("Mean R-Squared Value Density for each Gage")+ - theme(plot.title = element_text(hjust = 0.5))+ - xlim(0,1)+ - geom_vline(xintercept = mean(gage_mean$x), lwd=0.5,colour="purple4",linetype = "dashed") - -ggplot(data = df, - mapping = aes(x=r_squared))+ - geom_density(color="indianred4")+ - theme_bw()+ - ylab("Density")+ - xlab("R-Squared")+ - ggtitle("Total R-Squared Value Density")+ - theme(plot.title = element_text(hjust = 0.5))+ - xlim(0,1)+ - geom_vline(xintercept = mean(df$r_squared, na.rm = TRUE), lwd=0.5,colour="deeppink3",linetype = "dashed") - - -############################################################################## -# Highest R-squared Gage analysis - -high_data <- read.csv("C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Ratings/out/usgs_ws_01652500-PRISM-storm_volume-ratings.csv") - -ggplot(data = high_data, - mapping = aes(x=mo,y=r_squared))+ - geom_line(color="magenta4")+ - theme_bw()+ - ylim(0,1)+ - xlab("Month")+ - ylab("R-Squared")+ - ggtitle("Monthly R-Squared, Highest Mean")+ - theme(plot.title = element_text(hjust = 0.5)) - - -############################################################################# -# Lowest R-squared Gage analysis - -low_data <- read.csv("C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Ratings/out/usgs_ws_01671025-PRISM-storm_volume-ratings.csv") - -ggplot(data = low_data, - mapping = aes(x=mo,y=r_squared))+ - geom_line(color="darkolivegreen3")+ - theme_bw()+ - ylim(-0.1,1)+ - xlab("Month")+ - ylab("R-Squared")+ - ggtitle("Monthly R-Squared, Lowest Mean")+ - theme(plot.title = element_text(hjust = 0.5)) - -############################################################################# - -precip <- read.csv("C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Peak Diff/usgs_ws_02024000-PRISM-daily.csv") -stormflow <- read.csv("C:/Users/ilona/OneDrive - Virginia Tech/HARP/Storm Tests/Peak Diff/usgs_ws_02024000-stormevent-flow.csv") - -precip$obs_date <- as_date(precip$obs_date) -stormflow$timestamp <- as_date(stormflow$timestamp) - -stormflow <- subset(stormflow, stormflow$timestamp >= min(precip$obs_date)) -stormflow <- subset(stormflow, stormflow$timestamp <= max(precip$obs_date)) - -compdata <- sqldf( - "select a.obs_date, a.precip_in, - b.flow, b.stormID - from precip as a - left outer join stormflow as b - on ( - a.yr = b.year - and a.obs_date = b.timestamp - )" -) - - - -############################################################################ -# Plot Ratings -ggplot(data=comp_ratings, - mapping = aes(x=mo))+ - geom_line(aes(y=r_01620500), color="darkred")+ - geom_line(aes(y=r_01629500), color="red3")+ - geom_line(aes(y=r_01634500), color="firebrick1")+ - geom_line(aes(y=r_01643700), color="orangered1")+ - geom_line(aes(y=r_01662800), color="darkorange1")+ - geom_line(aes(y=r_01669000), color="orange1")+ - geom_line(aes(y=r_01669520), color="goldenrod1")+ - geom_line(aes(y=r_02018500), color="yellow2")+ - geom_line(aes(y=r_02021500), color="yellowgreen")+ - geom_line(aes(y=r_02024000), color="palegreen3")+ - geom_line(aes(y=r_02026000), color="darkslategray3")+ - geom_line(aes(y=r_02029000), color="deepskyblue3")+ - geom_line(aes(y=r_02031000), color="dodgerblue4")+ - geom_line(aes(y=r_02032250), color="darkslateblue")+ - geom_line(aes(y=r_02034000), color="purple4")+ - geom_line(aes(y=r_02033500), 
color="darkorchid4")+ - geom_line(aes(y=r_02051000), color="darkviolet")+ - geom_line(aes(y=r_02054530), color="darkmagenta")+ - geom_line(aes(y=r_03165000), color="deeppink3")+ - geom_line(aes(y=r_03176500), color="hotpink")+ - theme_bw()+ - xlab("Month")+ - ylab("R-Squared")+ - ggtitle("Monthly R-Squared")+ - theme(plot.title = element_text(hjust = 0.5))+ - ylim(0,1) - - - -ggplot(data=comp_ratings, - mapping = aes(x=mo))+ - geom_line(aes(y=mean(r_01620500)), color="darkred")+ - geom_line(aes(y=mean(r_01629500)), color="red3")+ - geom_line(aes(y=mean(r_01634500)), color="firebrick1")+ - geom_line(aes(y=mean(r_01643700)), color="orangered1")+ - geom_line(aes(y=mean(r_01662800)), color="darkorange1")+ - geom_line(aes(y=mean(r_01669000)), color="orange1")+ - geom_line(aes(y=mean(r_01669520)), color="goldenrod1")+ - geom_line(aes(y=mean(r_02018500)), color="yellow2")+ - geom_line(aes(y=mean(r_02021500)), color="yellowgreen")+ - geom_line(aes(y=mean(r_02024000)), color="palegreen3")+ - geom_line(aes(y=mean(r_02026000)), color="darkslategray3")+ - geom_line(aes(y=mean(r_02029000)), color="deepskyblue3")+ - geom_line(aes(y=mean(r_02031000)), color="dodgerblue4")+ - geom_line(aes(y=mean(r_02032250)), color="darkslateblue")+ - geom_line(aes(y=mean(r_02034000)), color="purple4")+ - geom_line(aes(y=mean(r_02033500)), color="darkorchid4")+ - geom_line(aes(y=mean(r_02051000)), color="darkviolet")+ - geom_line(aes(y=mean(r_02054530)), color="darkmagenta")+ - geom_line(aes(y=mean(r_03165000)), color="deeppink3")+ - geom_line(aes(y=mean(r_03176500)), color="hotpink")+ - theme_bw()+ - xlab("Month")+ - ylab("R-Squared")+ - ggtitle("Monthly R-Squared")+ - theme(plot.title = element_text(hjust = 0.5))+ - ylim(0.25,0.75) - - -ggplot(data=comp_ratings, - mapping = aes(x=mo))+ - geom_line(aes(y=mean(r_03176500)), color="hotpink") - - -ggplot(data = comp_ratings, - mapping = aes(x=mo))+ - geom_line(aes(y=r_01620500), color="red3")+ - geom_line(aes(y=r_01634500), color="orange2")+ - geom_line(aes(y=r_01669000), color="gold1")+ - geom_line(aes(y=r_02021500), color="yellowgreen")+ - geom_line(aes(y=r_02031000), color="darkturquoise")+ - geom_line(aes(y=r_02034000), color="dodgerblue")+ - geom_line(aes(y=r_02044500), color="dodgerblue4")+ - geom_line(aes(y=r_02054530), color="purple3")+ - geom_line(aes(y=r_03165000), color="orchid3")+ - geom_line(aes(y=r_03176500), color="hotpink")+ - theme_bw()+ - xlab("Month")+ - ylab("R-Squared")+ - ggtitle("Monthly R-Squared")+ - theme(plot.title = element_text(hjust = 0.5))+ - geom_rect(aes(xmin = 6, xmax = 8, ymin = 0.025, ymax = 0.55), - fill="transparent", color="red", size=0.75)+ - geom_rect(aes(xmin = 3, xmax = 5, ymin = 0.55, ymax = 0.85), - fill="transparent", color="red", size=0.75)+ - geom_rect(aes(xmin = 9.25, xmax = 10.75, ymin = 0.4, ymax = 0.85), - fill="transparent", color="red", size=0.75)+ - geom_rect(aes(xmin = 1.5, xmax = 2.5, ymin = 0.45, ymax = 0.85), - fill="transparent", color="red", size=0.75)+ - ylim(0,1) - - -# July Dip (> 5) -ggplot(data = comp_ratings, - mapping = aes(x=mo))+ - geom_line(aes(y=r_01620500), color="red3")+ - geom_line(aes(y=r_01634500), color="orange2")+ - geom_line(aes(y=r_02021500), color="yellowgreen")+ - geom_line(aes(y=r_02044500), color="dodgerblue4")+ - geom_line(aes(y=r_02054530), color="purple3")+ - geom_line(aes(y=r_03165000), color="orchid3")+ - theme_bw()+ - xlab("Month")+ - ylab("R-Squared")+ - ggtitle("July Decrease in Monthly R-Squared")+ - theme(plot.title = element_text(hjust = 0.5))+ - xlim(5,9)+ - ylim(0,1) - -# 
April Peak (not > 5) -ggplot(data = comp_ratings, - mapping = aes(x=mo))+ - geom_line(aes(y=r_02021500), color="yellowgreen")+ - geom_line(aes(y=r_02034000), color="dodgerblue")+ - geom_line(aes(y=r_02054530), color="purple3")+ - geom_line(aes(y=r_03165000), color="orchid3")+ - geom_line(aes(y=r_03176500), color="hotpink")+ - theme_bw()+ - xlab("Month")+ - ylab("R-Squared")+ - ggtitle("April Peak Monthly R-Squared")+ - theme(plot.title = element_text(hjust = 0.5))+ - xlim(2,6)+ - ylim(0,1) - - -# February Peak (not > 5) -ggplot(data = comp_ratings, - mapping = aes(x=mo))+ - geom_line(aes(y=r_01669000), color="gold1")+ - geom_line(aes(y=r_02031000), color="darkturquoise")+ - geom_line(aes(y=r_02054530), color="purple3")+ - geom_line(aes(y=r_03176500), color="hotpink")+ - theme_bw()+ - xlab("Month")+ - ylab("R-Squared")+ - ggtitle("February Peak Monthly R-Squared")+ - theme(plot.title = element_text(hjust = 0.5))+ - xlim(1,4)+ - ylim(0,1) - - -# October Peak (not > 5) -ggplot(data = comp_ratings, - mapping = aes(x=mo))+ - geom_line(aes(y=r_01620500), color="red3")+ - geom_line(aes(y=r_02021500), color="yellowgreen")+ - geom_line(aes(y=r_02031000), color="darkturquoise")+ - geom_line(aes(y=r_02044500), color="dodgerblue4")+ - geom_line(aes(y=r_03165000), color="orchid3")+ - theme_bw()+ - xlab("Month")+ - ylab("R-Squared")+ - ggtitle("October Peak Monthly R-Squared")+ - theme(plot.title = element_text(hjust = 0.5))+ - xlim(8,12)+ - ylim(0,1) - - -# Data frame with monthly mean r-squared value -monthly_mean <- data.frame(rowMeans(comp_ratings[,2:11])) -monthly_mean$mo <- c(1:12) -monthly_mean$season <- ifelse (monthly_mean$mo %in% c(6:8), - "SUMMER", - ifelse (monthly_mean$mo %in% c(9:11), - "AUTUMN", - ifelse (monthly_mean$mo %in% c(12,1,2), - "WINTER", - ifelse (monthly_mean$mo %in% c(3:5), - "SPRING", NA)))) -monthly_mean <- monthly_mean[, c(2,1)] -colnames(monthly_mean)[2] <- "mean_r_squared" - - - - -# Add season to comp data -comp_ratings$season <- ifelse (comp_ratings$mo %in% c(6:8), - "SUMMER", - ifelse (comp_ratings$mo %in% c(9:11), - "AUTUMN", - ifelse (comp_ratings$mo %in% c(12,1,2), - "WINTER", - ifelse (comp_ratings$mo %in% c(3:5), - "SPRING", NA)))) - -library(ggbeeswarm) - -ggplot(data = pivot_longer(comp_ratings,2:21))+ - aes(y = season, x=value, color=season)+ - geom_boxplot()+ - xlim(0,1)+ - theme_bw()+ - xlab("R-Squared")+ - ylab("Season")+ - ggtitle("Seasonal R-Squared Values")+ - theme(plot.title = element_text(hjust = 0.5), legend.position = "none") - - -ggplot(data = pivot_longer(comp_ratings,2:21))+ - aes(y = mo, x=value)+ - geom_boxplot(mapping = aes(group = mo))+ - xlim(0,1)+ - theme_bw()+ - xlab("R-Squared")+ - ylab("Month")+ - ggtitle("Monthly R-Squared Values")+ - theme(plot.title = element_text(hjust = 0.5), legend.position = "none")+ - scale_y_continuous(breaks = seq(0,12,1)) - - - - -# Plot monthly average rsquared value -ggplot(data = monthly_mean, - mapping = aes(x=mo))+ - geom_line(aes(y=mean_r_squared), color="hotpink3")+ - theme_bw()+ - xlab("Month")+ - ylab("Mean R-Squared")+ - ggtitle("Monthly Average R-Squared Values")+ - theme(plot.title = element_text(hjust = 0.5))+ - ylim(0,1) - - - - - - - - - - - - - diff --git a/HARP-2024-2025/VAHydro_metric_script.R b/HARP-2024-2025/VAHydro_metric_script.R deleted file mode 100644 index 89330e30..00000000 --- a/HARP-2024-2025/VAHydro_metric_script.R +++ /dev/null @@ -1,116 +0,0 @@ -# Inputs (args): -# 1 = path to precip data you want to use -# 2 = path of new csv 
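# Example invocation matching the two arguments described above (the csv file
# names here are hypothetical):
#   Rscript VAHydro_metric_script.R usgs_ws_01613900-PRISM-all.csv precip_metrics.csv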
-suppressPackageStartupMessages(library(dataRetrieval)) -suppressPackageStartupMessages(library(lubridate)) -suppressPackageStartupMessages(library(sqldf)) -suppressPackageStartupMessages(library(dplyr)) -args <- commandArgs(trailingOnly = TRUE) -if (length(args) != 2) { - message("Usage: Rscript usgsdata.R gage_id output_path") - q() -} - -read_path <- args[1] -write_path <- args[2] - -message("Pull csv from input file path") - -df <- read.csv(read_path) - -summary_analytics <- function(df){ - - #creating a yearly summary with each year and its total precip - yearly.summary <- - sqldf( - "SELECT yr, SUM(precip_in) AS total_precip - FROM df - GROUP BY yr" - ) - - #summary analytics - - precip_annual_max_in <- - max(yearly.summary$total_precip) - - precip_annual_max_year <- - yearly.summary$yr[which.max(yearly.summary$total_precip)] - - precip_annual_mean_in <- - mean(yearly.summary$total_precip) - - - #For min values and years, we can exclude the first and last row since the - #current year and first years are incomplete data - precip_annual_min_in <- - min(yearly.summary$total_precip[c(-nrow(yearly.summary),-1)]) - precip_annual_min_year <- - yearly.summary$yr[which.min - (yearly.summary$total_precip[c(-nrow - (yearly.summary),-1)])] - - - - # Create daily summary to use for all data. This makes hourly data daily sums. - daily.summary <- - sqldf( - "SELECT yr, mo, da, obs_date, SUM(precip_in) AS total_precip - FROM df - GROUP BY yr, mo, da" - ) #include obs_date after SQL to data frame - precip_daily_max_in <- max(daily.summary$total_precip) - #move to beginning - - - #if else statement evaluates the amount of unique hours and if 24, - #then hourly max is taken. If not, hourly max is NA - if(length(unique(df$hr)) == 24){ - precip_hourly_max_in <- max(df$precip_in) - } else { - precip_hourly_max_in <- NA - } - #Alternatively to a null value for hourly precip in daily data, - #we could look at the rainfall distribution table for a - #type II storm and multiply by the max P(t)/P(24) value. - #this function uses NULL for daily data - - - #l90 using zoo package - l90_precip_in_roll <- - min(rollapply(daily.summary$total_precip, width = 90, FUN = mean, - fill = NA, align = "right"), - na.rm = TRUE) - - #l90 using IHA - if(length(unique(df$hr)) == 24){ - Qout_zoo <- zoo(as.numeric(daily.summary$total_precip), order.by = as.POSIXct(daily.summary$obs_date)) - Qout_g2 <- data.frame(group2(Qout_zoo)) - l90_precip_in <- min(Qout_g2$X90.Day.Min) - } else { - Qout_zoo <- zoo(as.numeric(df$precip_in), order.by = as.POSIXct(df$obs_date)) - Qout_g2 <- data.frame(group2(Qout_zoo)) - l90_precip_in <- min(Qout_g2$X90.Day.Min) - } - - - - #makes data frame with all 9 metrics - metrics<- data.frame(precip_annual_max_in = precip_annual_max_in, - precip_annual_max_year = precip_annual_max_year, - precip_annual_mean_in = precip_annual_mean_in, - precip_annual_min_in = precip_annual_min_in, - precip_annual_min_year = precip_annual_min_year, - precip_daily_max_in = precip_daily_max_in, - precip_hourly_max_in = precip_hourly_max_in, - l90_precip_in = l90_precip_in, - l90_precip_in_roll = l90_precip_in_roll) -} - - -summary_analytics(df) -#What would be the best way to format the output csv? 
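# One simple answer to the question above: capture the one-row data frame that
# summary_analytics() returns and write it without row names, so each metric
# becomes a named column in the output csv. Sketch only -- note the assignment;
# as written below, the function's result is never captured and precip_metric
# is not defined anywhere.
precip_metrics <- summary_analytics(df)
write.csv(precip_metrics, write_path, row.names = FALSE)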
- - -message(paste0("Write csv in new file path: ",write_path)) -write.csv(precip_metric,write_path) - diff --git a/HARP-2024-2025/access-db.R b/HARP-2024-2025/access-db.R deleted file mode 100644 index 05a20cf5..00000000 --- a/HARP-2024-2025/access-db.R +++ /dev/null @@ -1,167 +0,0 @@ -library("RPostgres") -library("sqldf") -library("dataRetrieval") - -# works with only Rpostgres. This is the connection string to the database. -# Using this allows us to directly query the database from R, rather than -# needing to save results via SQL and send off text files with results. These -# connection strings are similar for all databases and rely on the permissions -# within the db -con <- dbConnect( - RPostgres::Postgres(), - dbname = "drupal.dh03", - host = "deq1.bse.vt.edu", - port = 5431, - user = readline("DB User Name: "), - password = getPass::getPass("REST Password: ") - ) - -#A function to get mean meteorology data for a given VA Hydro feature's geometry -#extent. In other words, this function can pull down meteorology data from our -#database (which covers the entire chesapeake Bay watershed), clip it to a given -#feature (like the drainage area of a gage or a parituclar watershed), and get -#the mean precipitation per day -get_feature_met <- function(con, - #The hydro code of the VA Hydro feature to clip the - #dataset to (the hydrocode is often from the base - #database and is not generated in VAHydro, whereas - #the feature id is the unique VA Hydro auto - #generated index number) - hydrocode, - #The variable key from VA Hydro for the met data of interest - varkey, - #The band of the raster. Important for NLDAS, PRISM - #and dyamet only have 1 band - band = 1, - #The feature associated with the meteorology data - #records. For precip, this is the cbp6_met_coverage - #generally (the CB p6 watershed) - mcovcode = 'cbp6_met_coverage') { - q = paste0( - "select met.featureid, to_timestamp(met.tsendtime) as obs_date, - extract(month from to_timestamp(met.tsendtime)) as mo, - --Note the syntax used to first clip the raster to the requested feature - --prior to summarizing, and note that we call mean as if it is a method of - --the object of ST_summarystats - (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), ", - band, ", TRUE)).mean as precip_mm, - 0.0393701 * (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), ", - band, ", TRUE)).mean as precip_in - from dh_feature as f - left outer join field_data_dh_geofield as fgeo - on ( - fgeo.entity_id = f.hydroid - and fgeo.entity_type = 'dh_feature' - ) - left outer join dh_variabledefinition as v - on (v.varkey = '", - varkey, "') - left outer join dh_feature as mcov - on ( - mcov.hydrocode = '", mcovcode, "' - ) - left outer join dh_timeseries_weather as met - on ( mcov.hydroid = met.featureid and met.varid = v.hydroid) - where f.hydrocode = '", hydrocode, "' - order by met.tsendtime -" - ) - dat <- sqldf(q, connection = con) - return(dat) -} - -# Inventory i.e show the number of records of a given meteorology data set per -# year and show the minimum and maximum dates tied to that data. 
We should -# expect to see 366 days on leap years and 365 days on non-leap years EXCEPT for -# daymet, which will ALWAYS have 365 days with 12-31 missing on leap years -varkey <- "nldas2_obs_hourly" -inv_met <- sqldf(paste0("select extract(year from to_timestamp(met.tsendtime)) as year, -min(to_timestamp(met.tsendtime)) as start_date, -max(to_timestamp(met.tsendtime)) as end_date, -count(*) -from ( - select met.tsendtime - from dh_feature as mcov - left outer join dh_variabledefinition as v - on ( - v.varkey = '",varkey,"' - ) - left outer join dh_timeseries_weather as met - on ( - mcov.hydroid = met.featureid and met.varid = v.hydroid - and met.entity_type = 'dh_feature' - ) - where mcov.hydrocode = 'cbp6_met_coverage' - and met.rast is not null - group by met.tsendtime -) as met -group by extract(year from to_timestamp(met.tsendtime)) -order by extract(year from to_timestamp(met.tsendtime))"), connection = con) - -# get all the tables from connection -dbListTables(con) - -#Set a USGS gage ID -gageid = '01634000' -#Format the ID to match the hydrocode used in the VA Hydro database -hydrocode = paste0('usgs_ws_', gageid) -#Get PRISM data, specified via the varkey argument in get_feature_met, and clip -#to the gage drainage area, specified via the hydrocode. Get the mean precip in -#the watershed for each day -prism_data <- get_feature_met(con, hydrocode, 'prism_mod_daily') - -#Add more details about the date as needed. Note that format relies on a date -#defined via https://pubs.opengroup.org/onlinepubs/009695399/utilities/date.html -prism_data[,c('yr', 'mo', 'da')] <- cbind(format(prism_data$obs_date,"%Y"), - format(prism_data$obs_date,"%m"), - format(prism_data$obs_date,"%d")) -#Repeat the process, this time pulling data from the daymet data source -daymet_data <- get_feature_met(con, hydrocode, 'daymet_mod_daily') -daymet_data[,c('yr', 'mo', 'da')] <- cbind(format(daymet_data$obs_date,"%Y"), - format(daymet_data$obs_date,"%m"), - format(daymet_data$obs_date,"%d")) -# Get USGS gage data from NWIS, the USGS REST services. We pull this data using -# the dataRetrieval R package which has extensive documentation on the options -# therewithin. See https://waterdata.usgs.gov/blog/dataretrieval/. Below, we -# a cached version of the function defined in our config.R file: -gage_info <- memo_readNWISsite(gageid) -#Extract the drainage area of the gage -da <- gage_info$drain_area_va -#Get daily, mean streamflow (pcode 000060, see dataRetrieval::parameterCdFile) -#from the gage specified in gageid -usgs_data <- memo_readNWISdv(gageid,'00060') -#As above, add more details about the dates -usgs_data[,c('yr', 'mo', 'da')] <- cbind(format(usgs_data$Date,"%Y"), - format(usgs_data$Date,"%m"), - format(usgs_data$Date,"%d")) - -#Join all data together. In other words, extract the relevant fields from the -#three data sources and combine them based on the day, month, and year. We could -#do this by joining by the dates, but you may notice that the USGS daily data -#may not come in as POSIX and doesn't have a time associated with it. 
-comp_data <- sqldf( - "select a.obs_date, a.precip_in as prism_p_in, - b.precip_in as daymet_p_in, c.X_00060_00003 as usgs_cfs - from prism_data as a - left outer join daymet_data as b - on ( - a.yr = b.yr - and a.mo = b.mo - and a.da = b.da - ) - left outer join usgs_data as c - on ( - a.yr = c.yr - and a.mo = c.mo - and a.da = c.da - ) - " -) -#If you are familiar with tidyverse verbiage, you may wonder why we use SQLDF -#here instead of a few dplyr::left_join()s and dplyr::select(). The reason is -#fairly simple. Our team is more well-versed in SQL than in tidyverse and -#tidyverse has a tendency to change these verbs slightly over time. Most of the -#time, this is not an issue. But, some of the more obscure commands often get -#depreceated. Look at the history of dplyr::across() as an example. Even now, -#the joins were changed just a few versions ago and will soon block certain code -#from executing without directly specifying all inputs! diff --git a/HARP-2024-2025/access-file.R b/HARP-2024-2025/access-file.R deleted file mode 100644 index 11d1662d..00000000 --- a/HARP-2024-2025/access-file.R +++ /dev/null @@ -1,418 +0,0 @@ -# Test datasets CSV -library("sqldf") -library("dataRetrieval") -library("lubridate") - -#Set the id for the USGS gage from which we will pull data and use to summarize -#our meteorology data. Currently, we have data set up for: -#Culpepper 01667500, Strasburg 01634000, Ruckersville 01665500 -gageid = '01665500' -# We redefine the gage id as a "hydrocode", which is how it is stored in the VA -#Hydro (DEQ) database -hydrocode = paste0('usgs_ws_', gageid) -#Pull data from VA Hydro, grabbing some precip data for the gage watershed -prism_data <- read.csv(paste0("http://deq1.bse.vt.edu:81/files/met/", hydrocode, "-prism-all.csv")) -#Use lubridate to add on some additional information about the date. These dates -#come in as posix and thus have timestamps, which make them a little trickier to -#work with. So, we can disaggregate them for easier joins and manipulation -prism_data[,c('yr', 'mo', 'da', 'wk')] <- cbind(year(as.Date(prism_data$obs_date)), - month(as.Date(prism_data$obs_date)), - day(as.Date(prism_data$obs_date)), - week(as.Date(prism_data$obs_date))) - -#Repeat the download and date disaggregation for daymet, a different precip -#source: -daymet_data <- read.csv(paste0("http://deq1.bse.vt.edu:81/files/met/", hydrocode, "-daymet-all.csv")) -daymet_data[,c('yr', 'mo', 'da', 'wk')] <- cbind(year(as.Date(daymet_data$obs_date)), - month(as.Date(daymet_data$obs_date)), - day(as.Date(daymet_data$obs_date)), - week(as.Date(daymet_data$obs_date))) -#Repeat the download and date disaggregation for NLDAS2, an HOURLY precip source -nldas2_data <- read.csv(paste0("http://deq1.bse.vt.edu:81/files/met/", hydrocode, "-nldas2-all.csv")) -nldas2_data[,c('yr', 'mo', 'da', 'wk')] <- cbind(year(as.Date(nldas2_data$obs_date)), - month(as.Date(nldas2_data$obs_date)), - day(as.Date(nldas2_data$obs_date)), - week(as.Date(nldas2_data$obs_date))) -#NLDAS is an hourly precip data set. We should aggregate it to be daily sums to -#match the data in PRISM and daymet for the purpose of comparison. We can do -#this in SQL by grouping by the the date columns we created for month, day, and -#year -nldas2_data <- sqldf( - "select featureid, min(obs_date) as obs_date, yr, mo, da, - sum(precip_mm) as precip_mm, sum(precip_in) as precip_in - from nldas2_data - group by yr, mo, da - order by yr, mo, da - " -) - - -# Get USGS gage data from NWIS, the USGS REST services. 
We pull this data using -# the dataRetrieval R package which has extensive documentation on the options -# therewithin. See https://waterdata.usgs.gov/blog/dataretrieval/. Below, we -# a cached version of the function defined in our config.R file: -gage_info <- memo_readNWISsite(gageid) -#Extract the drainage area of the gage -da <- gage_info$drain_area_va -#Get daily, mean streamflow (pcode 000060, see dataRetrieval::parameterCdFile) -#from the gage specified in gageid -usgs_data <- memo_readNWISdv(gageid,'00060') -#Extract date information from the gage using lubridate as above -usgs_data[,c('yr', 'mo', 'da')] <- cbind(year(as.Date(usgs_data$Date)), - month(as.Date(usgs_data$Date)), - day(as.Date(usgs_data$Date))) - -#Join all data together. In other words, extract the relevant fields from the -#three data sources and combine them based on the day, month, and year. We could -#do this by joining by the dates, but you may notice that the USGS daily data -#may not come in as POSIX and doesn't have a time associated with it. We could -#do this via tidyverse verbs, but our team typically prefers SQL as it tends to -#change a lot less frequently i.e. will require less long-term maintenance -comp_data <- sqldf( - "select a.obs_date, a.precip_in as prism_p_in, - a.yr, a.mo, a.da, a.wk, - b.precip_in as daymet_p_in, d.precip_in as nldas2_p_in, - c.X_00060_00003 as usgs_cfs - from prism_data as a - left outer join daymet_data as b - on ( - a.yr = b.yr - and a.mo = b.mo - and a.da = b.da - ) - left outer join usgs_data as c - on ( - a.yr = c.yr - and a.mo = c.mo - and a.da = c.da - ) - left outer join nldas2_data as d - on ( - a.yr = d.yr - and a.mo = d.mo - and a.da = d.da - ) - order by a.yr, a.mo, a.da - " -) - -# Index the dataset, giving a unique number to each row. Please note that these -# are already available via rownames(comp_data) but here we add it as a column -# to call them in SQLDF. You can see how SQL changes our R workflows. As the -# project moves forward, feel free to branch out and try other workflows but -# please note we will often encourage the use of SQL as we assist in tasks. But -# all code is acceptable if it is documented and works! -comp_data$dataset_day <- index(comp_data) - -#Add a column for the date since this is now daily data -comp_data$date <- as.Date(comp_data$obs_date) - -#Here, we add in columns to represent tomorrow's flow and the change in flow -#from today to yesterday and today and tomorrow. This give an indication of how -#flow is changing. We can do so using R's indexing, simply calling the next rows -#down where needed -# comp_data$nextday_usgs_cfs <- c(comp_data$usgs_cfs[2:nrow(comp_data)],NA) -# comp_data$nextday_d_cfs <- comp_data$nextday_usgs_cfs - comp_data$usgs_cfs -# comp_data$today_d_cfs <- comp_data$usgs_cfs - c(NA,comp_data$usgs_cfs[1:(nrow(comp_data) - 1)]) - -#We could alternatively get this via an SQL query that runs just about as -#quickly! 
Note the use of the Lag() function and its inputs in the subquery used -#to pull in yesterdays and todays data -comp_data <- sqldf( - "select b.*, - b.nextday_usgs_cfs - b.usgs_cfs as nextday_d_cfs, - b.usgs_cfs - b.yesterday_usgs_cfs as today_d_cfs - FROM ( - select - a.*, - Lag(a.usgs_cfs,1) OVER(ORDER BY a.dataset_day) as yesterday_usgs_cfs, - Lag(a.usgs_cfs,-1) OVER(ORDER BY a.dataset_day) as nextday_usgs_cfs - from comp_data as a - order by a.dataset_day - ) as b - " -) - -# Add a cfs Precip column using the following conversions for EACH of the precip -# data sources: -# acre-feet/day = 3.07 MGD -# cfs = 1.57 MGD -# DAsqmi * 640 ac/sqmi * p_in_per_day / 12.0 = Precip ac-ft/day -# P acft/day / 3.07 = P mgd -# P mgd * 1.572 = P cfs -# 1.572 * (DAsqmi * 640.0 * p_in / 12.0) / 3.07 -comp_data$prism_p_cfs <- 1.572 * (da * 640.0 * comp_data$prism_p_in / 12.0) / 3.07 -comp_data$daymet_p_cfs <- 1.572 * (da * 640.0 * comp_data$daymet_p_in / 12.0) / 3.07 -comp_data$nldas2_p_cfs <- 1.572 * (da * 640.0 * comp_data$nldas2_p_in / 12.0) / 3.07 - -#We have a lot of daily data in comp_data, but we suspect to see more noteable -#trends in weekly data compared to daily data given travel time and -#abstraction/storage considerations. So, we can summarize our data by grouping -#it via the week variable we added in earlier using lubridate::week. -#(again, you may have done this previously via dplyr::group_by and summarize) -week_data <- sqldf( - "select min(obs_date) as week_begin, yr, wk, min(dataset_day) as dataset_day_begin, - avg(daymet_p_in) as daymet_p_in, avg(daymet_p_cfs) as daymet_p_cfs, - avg(prism_p_in) as prism_p_in, avg(prism_p_cfs) as prism_p_cfs, - avg(nldas2_p_in) as nldas2_p_in, avg(nldas2_p_cfs) as nldas2_p_cfs, - avg(usgs_cfs) as usgs_cfs, avg(today_d_cfs) as today_d_cfs, - avg(nextday_d_cfs) as nextday_d_cfs - from comp_data - group by yr, wk - order by yr, wk - " -) -#Base the month column off of the min day oindex for that week, or the day that -#week began -week_data$mo <- month(week_data$week_begin) - - -#Why does daymet generally procude worse relationships? Isn't it similar to -#PRISM? What is the relationship between PRISM and daymet as we expected these -#to be relatively similar -comparePrecip <- lm(comp_data$daymet_p_cfs ~ comp_data$prism_p_cfs) -summary(comparePrecip)$adj.r.squared -plot(comp_data$daymet_p_cfs ~ comp_data$prism_p_cfs) -#What if we offset the PRSIM data by one day? -compareNextDayPRISM <- lm(comp_data$daymet_p_cfs[1:(nrow(comp_data) - 1)] ~ - comp_data$prism_p_cfs[2:nrow(comp_data)]) -#Much improved. More like we expected, seems like daymet is offset. -summary(compareNextDayPRISM)$adj.r.squared -plot(comp_data$daymet_p_cfs[1:(nrow(comp_data) - 1)] ~ - comp_data$prism_p_cfs[2:nrow(comp_data)]) - -#Add a column with the offset daymet data -comp_data$daymet_p_cfs_nextDay <- c(NA,comp_data$daymet_p_cfs[1:(nrow(comp_data) - 1)]) - -#Create a simple linear regression of flow and precip with flow as the dependent -#variable to test if there is a clear and easy relationship between flow and -#precip. Using the formula notation "~" we can specify the regression as -#dependent ~ independent variable as long as the data option is specified in the -#function -mod_prism <- lm(usgs_cfs ~ prism_p_cfs, data = comp_data) -#Look at the regression paramters. Identify how well the regression performed -#(spoiler alert, it performed poorly, which makes sense given time of travel and -#abstraction/storage. We wouldn't want it to be easy!) 
-summary(mod_prism) - -#Repeated for daymet, running the linear regression between the USGS data and -#daymet -mod_daymet <- lm(usgs_cfs ~ daymet_p_cfs, data = comp_data) -#Evaluate the performance of the regression -summary(mod_daymet) - -# Weekly cfs vs P. Above, we created linear regressions for flow and -# precipitations. Now we repeat this process to see if weekly flow/precip is -# correlated more strongly. We'd suspect this to be a bit more reliably related, -# but how much stronger? -mod_week_prism <- lm(usgs_cfs ~ prism_p_cfs, data=week_data) -summary(mod_week_prism) -#Looking at the results, we see some correlation but it's still not very strong -plot(mod_week_prism$model$usgs_cfs ~ mod_week_prism$model$prism_p_cfs) -mod_week_daymet <- lm(usgs_cfs ~ daymet_p_cfs, data=week_data) -summary(mod_week_daymet) -plot(mod_week_daymet$model$usgs_cfs ~ mod_week_daymet$model$daymet_p_cfs) - - -# Let's repeat the regression anlysis now for ONLY January, which tends to be a -# wetter month. We start with the daily data. For PRISM: -# PRISM -mod_prism_jan <- lm(usgs_cfs ~ prism_p_cfs, data=comp_data[(comp_data$mo == 1),]) -summary(mod_prism_jan) -# daymet -mod_daymet_jan <- lm(usgs_cfs ~ daymet_p_cfs_nextDay, data=comp_data[(comp_data$mo == 1),]) -summary(mod_daymet_jan) -#Plot the daily data for january and see how the flow relates to the summarized -#precip data. This is equivalent to comp_data$usgs_cfs and comp_data$daymet_p_cfs -plot(mod_daymet_jan$model$usgs_cfs ~ mod_daymet_jan$model$daymet_p_cfs_nextDay) - -# January, next day flow todays precip -mod_prism_jan_nd <- lm(nextday_usgs_cfs ~ prism_p_cfs, - data=comp_data[(comp_data$mo == 1),]) -summary(mod_prism_jan_nd) - -#Repeat regression, but now compare the CHANGE in flow between today and -#tomorrow with TODAYs precipitation. Again, we stick with just January for now. -mod_prism_jan_ndd <- lm(nextday_d_cfs ~ prism_p_cfs, - data=comp_data[(comp_data$mo == 1),]) -summary(mod_prism_jan_ndd) -plot(mod_prism_jan_ndd$model$nextday_d_cfs ~ mod_prism_jan_ndd$model$prism_p_cfs) - -# Repeat the above analysis for daymet e.g. the CHANGE in flow b/t today and -# tomorrow and TODAY's daymet precipitation -mod_daymet_jan_ndd <- lm(nextday_d_cfs ~ daymet_p_cfs_nextDay, - data=comp_data[which(comp_data$mo == 1),]) -summary(mod_daymet_jan_ndd) -plot(mod_daymet_jan_ndd$model$nextday_d_cfs ~ mod_daymet_jan_ndd$model$daymet_p_cfs) - -# Days with zero rainfall may be skewing our results based on the plots above. -# So, repeat the analysis now only comparing against change in flows on the -# same day as non-zero rain -# *** DAYMET -mod_daymet_jan_nz_ndd <- lm(nextday_d_cfs ~ daymet_p_cfs_nextDay, - data = comp_data[((comp_data$mo == 1) & (comp_data$daymet_p_cfs > 0)),]) -summary(mod_daymet_jan_nz_ndd) -plot(mod_daymet_jan_nz_ndd$model$nextday_d_cfs ~ mod_daymet_jan_nz_ndd$model$daymet_p_cfs) -# *** PRISM -mod_prism_jan_nz_ndd <- lm(nextday_d_cfs ~ prism_p_cfs, - data = comp_data[which((comp_data$mo == 1) & (comp_data$prism_p_cfs > 0)),]) -summary(mod_prism_jan_nz_ndd) -plot(mod_prism_jan_nz_ndd$model$nextday_d_cfs ~ mod_prism_jan_nz_ndd$model$prism_p_cfs) - -# Now, we look at the change in flow on the day after it rains to the summarized -# precipitation. 
So, repeat the same analysis but now compare to nextday flow
-# difference
-# *** DAYMET
-mod_daymet_jan_nz_ndd <- lm(nextday_d_cfs ~ daymet_p_cfs_nextDay,
-                            data = comp_data[((comp_data$mo == 1) & (comp_data$daymet_p_cfs > 0)),])
-summary(mod_daymet_jan_nz_ndd)
-plot(mod_daymet_jan_nz_ndd$model$nextday_d_cfs ~ mod_daymet_jan_nz_ndd$model$daymet_p_cfs_nextDay)
-# *** PRISM
-mod_prism_jan_nz_ndd <- lm(nextday_d_cfs ~ prism_p_cfs,
-                           data = comp_data[((comp_data$mo == 1) & (comp_data$prism_p_cfs > 0)),])
-summary(mod_prism_jan_nz_ndd)
-plot(mod_prism_jan_nz_ndd$model$nextday_d_cfs ~ mod_prism_jan_nz_ndd$model$prism_p_cfs)
-
-
-# Now, we look at the change in flow on the day after it rains to the summarized
-# precipitation. But, this time only look for days where there is some change in
-# flow
-# *** DAYMET
-mod_daymet_jan_nz_cdd <- lm(nextday_d_cfs ~ daymet_p_cfs_nextDay,
-                            data = comp_data[((comp_data$mo == 1) & (comp_data$nextday_d_cfs > 0)),])
-summary(mod_daymet_jan_nz_cdd)
-plot(mod_daymet_jan_nz_cdd$model$nextday_d_cfs ~ mod_daymet_jan_nz_cdd$model$daymet_p_cfs_nextDay)
-# *** PRISM
-mod_prism_jan_nz_cdd <- lm(nextday_d_cfs ~ prism_p_cfs,
-                           data = comp_data[((comp_data$mo == 1) & (comp_data$nextday_d_cfs > 0)),])
-summary(mod_prism_jan_nz_cdd)
-plot(mod_prism_jan_nz_cdd$model$nextday_d_cfs ~ mod_prism_jan_nz_cdd$model$prism_p_cfs)
-
-
-#Try the above analysis (compare change in next day flow to precip only on days
-#where change in flow is positive), but try for February which is another wet
-#month
-mod_prism_mon_nz_ndd <- lm(nextday_d_cfs ~ prism_p_cfs,
-                           data = comp_data[which((comp_data$mo == 2) & (comp_data$nextday_d_cfs > 0)),])
-summary(mod_prism_mon_nz_ndd)
-plot(mod_prism_mon_nz_ndd$model$nextday_d_cfs ~ mod_prism_mon_nz_ndd$model$prism_p_cfs)
-
-#The correlations above are decent.
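# A minimal refactoring sketch (not part of the original analysis) of the
# repeated month-filtered fits above: fit nextday_d_cfs against a chosen precip
# column for one month, optionally dropping zero-precip days, and return the
# adjusted R-squared. The function name fit_month_ndd and its arguments are
# illustrative assumptions; the mon_lm() helper defined below does a version of
# this systematically across all twelve months.
fit_month_ndd <- function(dat, precip_col, month, nonzero_precip = TRUE) {
  sub <- dat[dat$mo == month, ]
  if (nonzero_precip) {
    sub <- sub[sub[[precip_col]] > 0, ]
  }
  fit <- lm(sub$nextday_d_cfs ~ sub[[precip_col]])
  summary(fit)$adj.r.squared
}
# e.g. fit_month_ndd(comp_data, "prism_p_cfs", month = 1) should match
# summary(mod_prism_jan_nz_ndd)$adj.r.squared from the January PRISM fit above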
Let's see what the relationship looks like -#across all months of the year -# do all months and assemble a barplot of R^2 - -# create a class to hold both stats and the plot to make it easier to do comparisons later -plotBin <- R6::R6Class( - "plotBin", - public = list( - plot = NULL, data=list(), atts=list(), - initialize = function(plot = NULL, data = list()){ - self.plot = plot; self.data=data; - } - ) -) -# Week -mon_lm <- function(sample_data, y_var, x_var, mo_var, data_name){ - plot_out <- plotBin$new(data = sample_data) - nwd_stats <- data.frame(row.names=c('month', 'rsquared_a')) - for (i in 1:12) { - mo_data=sample_data[which((sample_data[,mo_var] == i)),] - weekmo_data <- lm(mo_data[,y_var] ~ mo_data[,x_var]) - dsum <- summary(weekmo_data) - nwd_stats <- rbind(nwd_stats, data.frame(i, dsum$adj.r.squared)) - } - plot_out$atts[['stats']] <- nwd_stats - barplot( - nwd_stats$dsum.adj.r.squared ~ nwd_stats$i, - ylim=c(0,1.0), - main=paste("lm(Q ~ P), monthly,",data_name) - ) - plot_out$plot <- recordPlot() - return(plot_out) -} - -nldas2_lm <- mon_lm(week_data, "nldas2_p_cfs", "usgs_cfs", "mo", "nldas2") -nldas2_lm$atts -nldas2_lm$plot - -prism_lm <- mon_lm(week_data, "prism_p_cfs", "usgs_cfs", "mo", "prism") -prism_lm$atts -prism_lm$plot - - -daymet_lm <- mon_lm(week_data, "daymet_p_cfs", "usgs_cfs", "mo", "daymet") -daymet_lm$atts -daymet_lm$plot - -# NLDAS2 -# do all months and assemble a barplot of R^2 -nldaswk_stats <- data.frame('month' = 1:12, 'rsquared_a' = numeric(12)) -for (i in 1:nrow(nldaswk_stats)) { - # Weekly d cfs vs P - mod_weekmo_nldas2_cfs <- lm(usgs_cfs ~ nldas2_p_cfs, - data=week_data[which((week_data$mo == i)),]) - dsum <- summary(mod_weekmo_nldas2_cfs) - #plot(mod_weekmo_nldas2_cfs$model$usgs_cfs ~ mod_weekmo_nldas2_cfs$model$nldas2_p_cfs) - - #mod_week_daymet_d_cfs <- lm(today_d_cfs ~ daymet_p_cfs, data=week_data) - #summary(mod_week_daymet_d_cfs) - #plot(mod_week_daymet_d_cfs$model$today_d_cfs ~ mod_week_daymet_d_cfs$model$daymet_p_cfs) - nldaswk_stats$rsquared_a[i] <- dsum$adj.r.squared -} -barplot(nldaswk_stats$dsum.adj.r.squared ~ nldaswk_stats$i, - main=paste(gage_info$station_nm ), ylim=c(0,1.0)) -daymet_lm <- mon_lm(week_data, "daymet_p_cfs", "usgs_cfs", "mo", "daymet") -daymet_lm$atts -daymet_lm$plot - - -#Binomial = Based on x, did Y occur? Looking at weekly rainfall totals, did a -#storm hydrograph occur? -#Poisson = Counts. Based on x, how many times did Y occur? Looking at weekly -#rainfall totals, how many storms should we have expected to see? - -#Are rollowing rainfall sums relevant to this analysis? -comp_data$rollingSumDaymet <- c(NA,NA,rollapply(comp_data$daymet_p_cfs_nextDay,3,sum, - align = "left")) -comp_data$rollingSumPRISM <- c(NA,NA,rollapply(comp_data$prism_p_cfs,3,sum, - align = "left")) -comp_data$rollingSumUSGS_prev <- c(NA,NA,rollapply(comp_data$usgs_cfs,3,sum, - align = "left")) -comp_data$rollingSumUSGS_next <- c(rollapply(comp_data$usgs_cfs,3,sum, - align = "left"),NA,NA) -rollModelPRISM <- lm(usgs_cfs ~ rollingSumPRISM,data = comp_data) -summary(rollModelPRISM) -plot(log10(rollModelPRISM$model)) -#Nope, not really. Maybe only at higher flows, which is when we'd expect -#correlation. Correlation is improved for next day flow however. What if we look -#at rolling vs rolling? -rollModel <- lm(rollingSumUSGS_prev ~ rollingSumPRISM,data = comp_data) -summary(rollModel) - -#What if we increase the rolling duration? At what point do we produce a maximum -#correlation? 
We would assume this increases with rolling time period -rsq <- numeric(180) -rsq_winter <- numeric(180) -for(i in 1:180){ - testData <- comp_data - testData$rollingSumPRISM <- c(rep(NA,(i - 1)),rollapply(testData$prism_p_cfs,i,sum, - align = "left")) - testData$rollingSumUSGS_prev <- c(rep(NA,(i - 1)),rollapply(testData$usgs_cfs,i,sum, - align = "left")) - rollModeli <- lm(rollingSumUSGS_prev ~ rollingSumPRISM,data = testData) - rsq[i] <- summary(rollModeli)$adj.r.squared - - rollModel_winteri <- lm(rollingSumUSGS_prev ~ rollingSumPRISM,data = testData[testData$mo %in% c(11,12,1,2,3),]) - rsq_winter[i] <- summary(rollModel_winteri)$adj.r.squared -} - -plot(rsq_winter) -points(rsq_winter,col = "blue") - - -plot(as.Date(comp_data$obs_date), - comp_data$prism_p_cfs, - col = "red",type = "l") -lines(as.Date(comp_data$obs_date), - comp_data$usgs_cfs) - - diff --git a/HARP-2024-2025/assessEventPrecip.R b/HARP-2024-2025/assessEventPrecip.R deleted file mode 100644 index 0d6066e3..00000000 --- a/HARP-2024-2025/assessEventPrecip.R +++ /dev/null @@ -1,312 +0,0 @@ -# Test datasets CSV -library("sqldf") -library("dataRetrieval") -library("lubridate") - -#Set the id for the USGS gage from which we will pull data and use to summarize -#our meteorology data. Currently, we have data set up for: -#Culpepper 01667500, Strasburg 01634000, Ruckersville 01665500 -gageid = '01665500' -# We redefine the gage id as a "hydrocode", which is how it is stored in the VA -#Hydro (DEQ) database -hydrocode = paste0('usgs_ws_', gageid) - -#Pull data from VA Hydro, grabbing some precip data for the gage watershed -prism_data <- read.csv(paste0("http://deq1.bse.vt.edu:81/files/met/", hydrocode, "-prism-all.csv")) - -#Use lubridate to add on some additional information about the date. These dates -#come in as posix and thus have timestamps, which make them a little trickier to -#work with. So, we can disaggregate them for easier joins and manipulation -prism_data[,c('yr', 'mo', 'da', 'wk')] <- cbind(year(as.Date(prism_data$obs_date)), - month(as.Date(prism_data$obs_date)), - day(as.Date(prism_data$obs_date)), - week(as.Date(prism_data$obs_date))) - -#Repeat the download and date disaggregation for daymet, a different precip -#source: -daymet_data <- read.csv(paste0("http://deq1.bse.vt.edu:81/files/met/", hydrocode, "-daymet-all.csv")) -daymet_data[,c('yr', 'mo', 'da', 'wk')] <- cbind(year(as.Date(daymet_data$obs_date)), - month(as.Date(daymet_data$obs_date)), - day(as.Date(daymet_data$obs_date)), - week(as.Date(daymet_data$obs_date))) -#Repeat the download and date disaggregation for NLDAS2, an HOURLY precip source -nldas2_data <- read.csv(paste0("http://deq1.bse.vt.edu:81/files/met/", hydrocode, "-nldas2-all.csv")) -nldas2_data[,c('yr', 'mo', 'da', 'wk')] <- cbind(year(as.Date(nldas2_data$obs_date)), - month(as.Date(nldas2_data$obs_date)), - day(as.Date(nldas2_data$obs_date)), - week(as.Date(nldas2_data$obs_date))) -#NLDAS is an hourly precip data set. We should aggregate it to be daily sums to -#match the data in PRISM and daymet for the purpose of comparison. We can do -#this in SQL by grouping by the the date columns we created for month, day, and -#year -nldas2_data <- sqldf( - "select featureid, min(obs_date) as obs_date, yr, mo, da, - sum(precip_mm) as precip_mm, sum(precip_in) as precip_in - from nldas2_data - group by yr, mo, da - order by yr, mo, da - " -) - - -# Get USGS gage data from NWIS, the USGS REST services. 
We pull this data using -# the dataRetrieval R package which has extensive documentation on the options -# therewithin. See https://waterdata.usgs.gov/blog/dataretrieval/. Below, we -# a cached version of the function defined in our config.R file: -gage_info <- memo_readNWISsite(gageid) -#Extract the drainage area of the gage -da <- gage_info$drain_area_va -#Get daily, mean streamflow (pcode 000060, see dataRetrieval::parameterCdFile) -#from the gage specified in gageid -usgs_data <- memo_readNWISdv(gageid,'00060') -#Extract date information from the gage using lubridate as above -usgs_data[,c('yr', 'mo', 'da')] <- cbind(year(as.Date(usgs_data$Date)), - month(as.Date(usgs_data$Date)), - day(as.Date(usgs_data$Date))) - -#Join all data together. In other words, extract the relevant fields from the -#three data sources and combine them based on the day, month, and year. We could -#do this by joining by the dates, but you may notice that the USGS daily data -#may not come in as POSIX and doesn't have a time associated with it. We could -#do this via tidyverse verbs, but our team typically prefers SQL as it tends to -#change a lot less frequently i.e. will require less long-term maintenance -comp_data <- sqldf( - "select a.obs_date, a.precip_in as prism_p_in, - a.yr, a.mo, a.da, a.wk, - b.precip_in as daymet_p_in, d.precip_in as nldas2_p_in, - c.X_00060_00003 as usgs_cfs - from prism_data as a - left outer join daymet_data as b - on ( - a.yr = b.yr - and a.mo = b.mo - and a.da = b.da - ) - left outer join usgs_data as c - on ( - a.yr = c.yr - and a.mo = c.mo - and a.da = c.da - ) - left outer join nldas2_data as d - on ( - a.yr = d.yr - and a.mo = d.mo - and a.da = d.da - ) - order by a.yr, a.mo, a.da - " -) - -# Index the dataset, giving a unique number to each row. Please note that these -# are already available via rownames(comp_data) but here we add it as a column -# to call them in SQLDF. You can see how SQL changes our R workflows. As the -# project moves forward, feel free to branch out and try other workflows but -# please note we will often encourage the use of SQL as we assist in tasks. But -# all code is acceptable if it is documented and works! -comp_data$dataset_day <- index(comp_data) - -#Add a column for the date since this is now daily data -comp_data$date <- as.Date(comp_data$obs_date) - -#Here, we add in columns to represent tomorrow's flow and the change in flow -#from today to yesterday and today and tomorrow. This give an indication of how -#flow is changing. We can do so using R's indexing, simply calling the next rows -#down where needed -# comp_data$nextday_usgs_cfs <- c(comp_data$usgs_cfs[2:nrow(comp_data)],NA) -# comp_data$nextday_d_cfs <- comp_data$nextday_usgs_cfs - comp_data$usgs_cfs -# comp_data$today_d_cfs <- comp_data$usgs_cfs - c(NA,comp_data$usgs_cfs[1:(nrow(comp_data) - 1)]) - -#We could alternatively get this via an SQL query that runs just about as -#quickly! 
Note the use of the Lag() function and its inputs in the subquery used -#to pull in yesterdays and todays data -comp_data <- sqldf( - "select b.*, - b.nextday_usgs_cfs - b.usgs_cfs as nextday_d_cfs, - b.usgs_cfs - b.yesterday_usgs_cfs as today_d_cfs - FROM ( - select - a.*, - Lag(a.usgs_cfs,1) OVER(ORDER BY a.dataset_day) as yesterday_usgs_cfs, - Lag(a.usgs_cfs,-1) OVER(ORDER BY a.dataset_day) as nextday_usgs_cfs - from comp_data as a - order by a.dataset_day - ) as b - " -) - -# Add a cfs Precip column using the following conversions for EACH of the precip -# data sources: -# acre-feet/day = 3.07 MGD -# cfs = 1.57 MGD -# DAsqmi * 640 ac/sqmi * p_in_per_day / 12.0 = Precip ac-ft/day -# P acft/day / 3.07 = P mgd -# P mgd * 1.572 = P cfs -# 1.572 * (DAsqmi * 640.0 * p_in / 12.0) / 3.07 -comp_data$prism_p_cfs <- 1.572 * (da * 640.0 * comp_data$prism_p_in / 12.0) / 3.07 -comp_data$daymet_p_cfs <- 1.572 * (da * 640.0 * comp_data$daymet_p_in / 12.0) / 3.07 -comp_data$nldas2_p_cfs <- 1.572 * (da * 640.0 * comp_data$nldas2_p_in / 12.0) / 3.07 -#Can we learn anything based on what stormSep gives us? e.g. the number of -#storms that occur in a given week, month, day, etc.? -#First, create a dataset where USGS flow is not NA -source("C:/Users/gcw73279.COV/Desktop/gitBackups/OWS/HARParchive/HARP-2024-2025/stormSep_USGS.R") - -stormCompData <- comp_data[!is.na(comp_data$usgs_cfs),] -stormOut <- stormSeparate(as.Date(stormCompData$obs_date), - inflow = stormCompData$usgs_cfs, - plt = FALSE,path = paste0(getwd(),"/StormPlots/"), - allMinimaStorms = TRUE, - baselineFlowOption = "Month") - -#For each storm, sum precip leading up to it (past 3, 5, 7, and 14 days?). Go -#through each storm and find the sum of precip of each dataset. Also find past -#3, 5, 7, and 14 days from storm endpoint including its full duration? What -#about stream length? What about DA or other NHDPLus factors? Can we get at -#travel time and or lag? Land use? -stormStats <- stormOut$Stats -#Throw out any storms that have inconsistent durations compared to start and end -#date. This can occur when a gage goes offline as StormSep doesn't check for -#this -stormEvents <- stormOut$Storms[(as.Date(stormStats$endDate) - as.Date(stormStats$startDate) + 1) == stormStats$durAll] -stormStats <- stormStats[(as.Date(stormStats$endDate) - as.Date(stormStats$startDate) + 1) == stormStats$durAll,] - -#Get rolling sums of precip over 3 day periods: -comp_data$roll3PRISM_cfs <- rollapply(comp_data$prism_p_cfs,3,sum,fill = NA,align = "right") - -#A function that gets precip from the rollingDur period but will include the -#full stormDuration -getRollPrecip <- function(comp_data,stormDuration, - rollingDur,endDate, - precipColName = "prism_p_cfs"){ - #Convert input date to date, just in case - sDate <- as.Date(endDate) - #Get the index in comp_date df where the endDate occurs - dateIndex <- grep(endDate,comp_data$date) - #Get all values from the precipColName in comp_data for the target duration - #adjusted for the storm duration. So, if there is a five-day storm, - #stormDuration is 5. If we are interested in rolling 7-day precip prior to and - #throughout the storm, we'd want rollingDur = 7. So, we need dateIndex - 7 - 5 - #The precip data is in: - precipData <- comp_data[,precipColName] - precipStorm <- precipData[(dateIndex - rollingDur - stormDuration + 2) : dateIndex] - #Return total precip. 
Adjust for NAs that may occur due to indexing numbers - #prior to start of comp_data - totalPrecip <- sum(precipStorm,na.rm = TRUE) - return(totalPrecip) -} - -#Add to stormStats the sum of precipitation from the 3-, 7-, and 14-day periods -#leading up to the storm and including the full storm duration. Convert to MG -cfsToMGD <- 86400 * 12*12*12/231/1000000 -#Only precip during the storm iteself -stormStats$roll1DayWStormPRISM_MG <- mapply(SIMPLIFY = TRUE, USE.NAMES = FALSE, - FUN = getRollPrecip, stormDuration = stormStats$durAll, - endDate = stormStats$endDate, - MoreArgs = list(comp_data = comp_data,rollingDur = 1, - precipColName = "prism_p_cfs") -) -stormStats$roll1DayWStormPRISM_MG <- stormStats$roll1DayWStormPRISM_MG * cfsToMGD -#Includes 1-day prior to the storm: -stormStats$roll2DayWStormPRISM_MG <- mapply(SIMPLIFY = TRUE, USE.NAMES = FALSE, - FUN = getRollPrecip, stormDuration = stormStats$durAll, - endDate = stormStats$endDate, - MoreArgs = list(comp_data = comp_data,rollingDur = 2, - precipColName = "prism_p_cfs") -) -stormStats$roll2DayWStormPRISM_MG <- stormStats$roll2DayWStormPRISM_MG * cfsToMGD - -#Plots of total storm volume to rolling 3-, 7-, 14-day periods with storm duration -plot(stormStats$roll1DayWStormPRISM_MG,stormStats$volumeTotalMG, - log = "xy") -#Plots of total storm volume above base Q to rolling 3-, 7-, 14-day periods with -#storm duration -plot(stormStats$roll1DayWStormPRISM_MG,stormStats$volumeAboveBaseQMG) - -#Linear models with storm duration -withStorm1 <- lm(volumeTotalMG ~ roll1DayWStormPRISM_MG, - data = stormStats) - -#Linear models with stormDuration above base Q -withStorm1_aboveBaseQ <- lm(volumeAboveBaseQMG ~ roll1DayWStormPRISM_MG, - data = stormStats) -withStorm2_aboveBaseQ <- lm(volumeAboveBaseQMG ~ roll2DayWStormPRISM_MG, - data = stormStats) - -summary(withStorm1) -summary(withStorm1_aboveBaseQ) -summary(withStorm2_aboveBaseQ) - -#The following showed a strong correlation. What if we look at power regression -#to smooth over small and large differences e.g. take power away from larger -#storms? This also removes storms in which no precip was found in the target -#duration. This may skew results and should be reviewed carefully due to the -#bias of picking only data where this method "works" -withStorm1_aboveBaseQ_log <- lm(log(volumeAboveBaseQMG) ~ log(roll1DayWStormPRISM_MG), - data = stormStats[stormStats$roll1DayWStormPRISM_MG > 0,]) -#The log power regression in this case actually performs worse: -summary(withStorm1_aboveBaseQ_log) - -#However, are we weighted by large storm volumes? It appears so, with one value -#even falling outside of Cooks Distance -plot(withStorm1_aboveBaseQ) - -#Does relationsip improve at all when we look only at specific months. Add month -#based on the startDate of the storm -stormStats$beginMonth <- as.numeric(format(as.Date(stormStats$startDate),"%m")) - -#What are the monthly relationships? -monthEventOut1Day <- mon_lm(stormStats, y_var = "volumeAboveBaseQMG", - x_var = "roll1DayWStormPRISM_MG", - mo_var = "beginMonth", "Storm Event Vol") -monthEventOut2Day <- mon_lm(stormStats, y_var = "volumeAboveBaseQMG", - x_var = "roll2DayWStormPRISM_MG", - mo_var = "beginMonth", "Storm Event Vol") - -#Data counts? -table(stormStats$beginMonth) -sum(stormStats$beginMonth == 10) -testReg <- lm(log(volumeAboveBaseQMG) ~ log(roll1DayWStormPRISM_MG), - data = stormStats[stormStats$beginMonth==10,]) -summary(testReg) - -#There may be some heavy influence from high storm events. 
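# A quick illustrative check of that influence concern (a sketch, not part of
# the original analysis): flag storms whose Cook's distance on the
# withStorm1_aboveBaseQ fit above exceeds the common 4/n rule of thumb. The
# threshold is an assumed convention, not a hard cutoff; cooks.distance() is
# base R.
cd <- cooks.distance(withStorm1_aboveBaseQ)
high_influence <- names(cd)[cd > 4 / length(cd)]
stormStats[high_influence, c("startDate", "endDate", "volumeAboveBaseQMG")]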
May want to consider -#power regressions or non-linear exponential regressions to reduce their -#influence or evens a Cooks Distance analysis to remove errant data points - -#For each storm in stormOut$Storms, use the storm start and end-date to find the -#7-day period leading up to the storm and the 7-day period following the storm. -#Show precip during this period and throughout storm duration. Then, highlight -#the separated storm hydrograph -for(i in 1:length(stormEvents)){ - print(i) - #Get the current storm flow data which will be used for highlighting that - #hydrograph - stormi <- stormEvents[[1]] - #Only need non-NA values since stormSep will output full timeseries but leave - #values as NA if they are not included in storm - stormi <- stormi[!is.na(stormi$flow),] - - #Get the start and end date of the storm from stormStats: - stormStart <- as.Date(stormStats$startDate[i]) - stormEnd <- as.Date(stormStats$endDate[i]) - #Adjust these values to increase the window: - plotStart <- stormStart - 7 - plotEnd <- stormEnd + 7 - - #Get the stream flow and baseflow from stormSep - flowData <- stormOut$flowData[stormOut$flowData$timestamp >= plotStart & - stormOut$flowData$timestamp <= plotEnd,] - #Join in the precip "flow" from comp_data: - flowDataAll <- sqldf("SELECT flowData.*, - comp.prism_p_in, - comp.daymet_p_in, - comp.nldas2_p_in, - comp.prism_p_cfs, - comp.daymet_p_cfs, - comp.nldas2_p_cfs - FROM flowData - LEFT JOIN comp_data as comp - ON flowData.timeStamp = comp.Date") - pathOut <- paste0("StormPlotsNew/stormPlot",i,".PNG") - plotStorm(pathOut,stormi,flowDataAll) -} diff --git a/HARP-2024-2025/caching function.R b/HARP-2024-2025/caching function.R deleted file mode 100644 index 36a3b20b..00000000 --- a/HARP-2024-2025/caching function.R +++ /dev/null @@ -1,33 +0,0 @@ -# set up a caching function for NWIS -Sys.setenv(USGS_cache_dir = "/media/model/usgs") - -hydro_tools <- 'https://raw.githubusercontent.com/HARPgroup/hydro-tools/master' -cbp6_location <- 'https://raw.githubusercontent.com/HARPgroup/cbp6/master' -hydro_tools_location <- hydro_tools -om_location <- "https://raw.githubusercontent.com/HARPgroup/om/master" -elfgen_location <- "https://raw.githubusercontent.com/HARPgroup/elfgen/master" -openmi_om_location <- "https://raw.githubusercontent.com/HARPgroup/openmi-om/master" -vahydro_location <- "https://github.com/HARPgroup/vahydro/blob/master" -github_location <- "C:/usr/local/home/git" -HARParchive_location <- paste0(github_location, "/HARParchive") -# foundation_location <- "C:\\Users\\faw18626\\OneDrive - Commonwealth of Virginia" -foundation_location <- paste0(Sys.getenv("USERPROFILE"),"\\OneDrive - Commonwealth of Virginia\\OWS\\foundation_datasets\\wsp\\wsp2020") -onedrive_location <- paste0(Sys.getenv("USERPROFILE"),"\\OneDrive - Commonwealth of Virginia") - -folder <- '/Workspace/tmp/' -export_path <- '/Workspace/tmp/' -base_url <- 'http://deq1.bse.vt.edu:81/d.dh' #needed for REST functions -save_url <- "http://deq1.bse.vt.edu:81/data/proj3/out"; -noaa_api_key <- 'OdnQxMwsNwbNTPiFJRygEwxflkOIrxhh' -options(noaakey = noaa_api_key) - -#### Old Stuff No Longer Needed? 
-vahydro_directory <- "/usr/local/home/git/r-dh-ecohydro/Analysis/fn_vahydro-2.0" -auth_directory <- "/usr/local/home/git/r-dh-ecohydro/ELFGEN/internal" -file_directory <- "/Workspace/tmp" -save_directory <- "/Workspace/tmp" -fxn_locations <- "/usr/local/home/git/r-dh-ecohydro/ELFGEN/internal/" -habitat_files <- "/usr/local/home/git/r-dh-ecohydro/Analysis/Habitat/" -fxn_vahydro <- "/usr/local/home/git/hydro-tools" -base_directory <- "/usr/local/home/git/r-dh-ecohydro" - diff --git a/HARP-2024-2025/dni_data_experiments.R b/HARP-2024-2025/dni_data_experiments.R deleted file mode 100644 index f461450f..00000000 --- a/HARP-2024-2025/dni_data_experiments.R +++ /dev/null @@ -1,9 +0,0 @@ - -# How to use SQL in R in a more flexible way than sqldf will permit -# https://dbi.r-dbi.org/reference/dbAppendTable.html -con <- dbConnect(RSQLite::SQLite(), ":memory:") -dbCreateTable(con, "dh_timeseries", ds$tsvalues) -dbWriteTable(con, "dh_timeseries", ds$tsvalues, overwrite = TRUE) -dbExecute(con, "insert into dh_timeseries(tid, tsvalue, featureid) values (2,4.5,10)") -dbExecute(con, "select * from dh_timeseries") -dbReadTable(con, "dh_timeseries") diff --git a/HARP-2024-2025/hydroimport_daily.R b/HARP-2024-2025/hydroimport_daily.R deleted file mode 100644 index f626c3a5..00000000 --- a/HARP-2024-2025/hydroimport_daily.R +++ /dev/null @@ -1,49 +0,0 @@ -# Inputs (args): -# 1 = File path of csv from VA Hydro -# 2 = Data source "nldas2, daymet, prism" -# 3 = End path of new csv -# Outputs: -# Csv file with manipulated data at end filepath - -# Library necessary packages -print("Accessing necessary libraries") -suppressPackageStartupMessages(library(lubridate)) -suppressPackageStartupMessages(library(sqldf)) - -# Set up command args -print("Reading command args") -args <- commandArgs(trailingOnly = TRUE) -if (length(args) != 3){ - message("Missing or extra inputs. Usage: Rscript hydroimport_daily.R input_path data_source output_path") - q() -} - -input_path <- args[1] -data_source <- args[2] -output_path <- args[3] - -# Pull csv from input file path -print("Reading csv") -hydro_daily <- read.csv(input_path) -# Add in more date information -print("Adding date information") -hydro_daily[,c('yr', 'mo', 'da', 'wk')] <- cbind(year(as.Date(hydro_daily$obs_date)), - month(as.Date(hydro_daily$obs_date)), - day(as.Date(hydro_daily$obs_date)), - week(as.Date(hydro_daily$obs_date))) - -# If data comes from nladas2 (hourly), it must be converted into daily data -print("Checking for nldas2") -if (data_source=="nldas2"){ - hydro_daily <- sqldf( - "select featureid, min(obs_date) as obs_date, yr, mo, da, - sum(precip_mm) as precip_mm, sum(precip_in) as precip_in - from hydro_daily - group by yr, mo, da - order by yr, mo, da - " - )} - -# Write csv in new file path -print("Writing csv") -write.csv(hydro_daily,file = output_path) \ No newline at end of file diff --git a/HARP-2024-2025/make_weekly_summary_ts.R b/HARP-2024-2025/make_weekly_summary_ts.R deleted file mode 100644 index 56930201..00000000 --- a/HARP-2024-2025/make_weekly_summary_ts.R +++ /dev/null @@ -1,61 +0,0 @@ -# Inputs (args): -# 1 = comp data csv filepath -# 2 = Output csv path -# 3 = data column - -# Libraries -suppressPackageStartupMessages(library(lubridate)) -suppressPackageStartupMessages(library(sqldf)) - - -args <- commandArgs(trailingOnly = T) - -if (length(args) != 3){ - message("Missing or extra inputs. 
Usage: Rscript make_weekly_summary_ts.R comp_data_filepath output_filepath data_column ") -} -# Missing from github issue:[weekly_column(default=weekly_mean_value)] -print("Assigning arguments") -comp_data_filepath <- args[1] -output_filepath <- args[2] -data_column <- args[3] - -print("Reading in comp data csv") -comp_data <- read.csv(comp_data_filepath) - -# Check for necessary data (Comp data doesnt have start and end date?) -print("Checking columns") -#if ("start_date" %in% colnames(comp_data)){}else{ - # message("start_date column missing") - # q() -#} - -#if ("end_date" %in% colnames(comp_data)){}else{ - #message("end_date column missing") - #q() -#} - -if (data_column %in% colnames(comp_data)){}else{ - message(data_column, "column missing") - q() -} - - -#converts our daily data into weekly -print("Converting to weekly data") - week_data <- sqldf( - "select min(obs_date) as week_begin, yr, wk, min(dataset_day) as dataset_day_begin, - avg(dataset_p_in) as dataset_p_in, avg(dataset_cfs) as dataset_cfs, - avg(usgs_cfs) as usgs_cfs, avg(today_d_cfs) as today_d_cfs, - avg(nextday_d_cfs) as nextday_d_cfs - from comp_data - group by yr, wk - order by yr, wk - " - ) - #Base the month column off of the min day oindex for that week, or the day that - #week began - week_data$mo <- month(week_data$week_begin) - - - write.csv(week_data,output_filepath) - diff --git a/HARP-2024-2025/retrieve_vahydro_metrics.R b/HARP-2024-2025/retrieve_vahydro_metrics.R deleted file mode 100644 index 2c4b9ecf..00000000 --- a/HARP-2024-2025/retrieve_vahydro_metrics.R +++ /dev/null @@ -1,12 +0,0 @@ -# GET VAHydro 1.0 RIVERSEG l90_Qout DATA -mdf <- data.frame( - 'model_version' = c('cbp-6.1'), - 'runid' = c('stormVol_prism'), - 'metric' = c('precip_annual_max_in'), - 'runlabel' = c('precip_annual_max_in') -) -met_data <- om_vahydro_metric_grid( - metric = metric, runids = mdf, bundle="watershed", ftype="usgs_full_drainage", - base_url = paste(site,'entity-model-prop-level-export',sep="/"), - ds = ds -) diff --git a/HARP-2024-2025/stormSep_USGS.R b/HARP-2024-2025/stormSep_USGS.R deleted file mode 100644 index 54261215..00000000 --- a/HARP-2024-2025/stormSep_USGS.R +++ /dev/null @@ -1,504 +0,0 @@ -# flowData <- dataRetrieval::readNWISdv("01613900",parameterCd = "00060") -# inflow <- flowData$X_00060_00003 -# timeIn <- as.Date(flowData$Date) -# mindex <- which(timeIn == "2019-01-01") -# maxdex <- which(timeIn == "2021-12-31") -# seqdex <- seq(mindex,maxdex) -# inflow <- inflow[seqdex] -# timeIn <- timeIn[seqdex] - - -#Input variables are: -# timeIn = Timestamps associated with streamflow data as vector -# inflow = Streamflow daily data as vector in cfs -# plt = Boolean to determine if plots are necessary -# path = Directory to store plots in. Used if plt is TRUE -# allMinimaStorms = Considers ALL local minima as potnetial storm endpoints. -# Will distinguish between peaks in the case of a multi-day storm hydrograph if -# set to TRUE. Otherwise, a storm is only considered as two subseuqnet local -# minima that fall below baseline flow -# baselineFlowOption = One of c("Water Year","Month","Calendar Year","Climate Year","All"). -# Defaults to "All". Determines how program will develop baseline flow. 
Uses -# horizontal line regression of baseflow and breaks it down based on all time, -# water year, calendar year, or month based on user selection -stormSeparate <- function(timeIn, inflow, - plt = F, - path = paste0(getwd(),"/"), - allMinimaStorms = FALSE, - baselineFlowOption = "All" -){ - #Call packages required by function if not called already: - require(grwat) - require(zoo) - require(sqldf) - - #First, get baseflow associated with inflow. Use defaults of grwat for now - #which involves three passes and the Lyne-Hollick (1979) hydrograph separation - #method with coefficient a as 0.925 - baseQ <- grwat::gr_baseflow(inflow) - - #Add timestamp to the baseflow separation for convenience by creating a - #dataframe - baseQ <- data.frame(timestamp = timeIn, Year = as.numeric(format(timeIn,"%Y")), - baseQ = baseQ,flow = inflow) - - #Find mins/maxes of three consecutive points such that the extreme is in the - #middle. These represent potential local mins and maxes - maxes <- rollapply(baseQ$flow,3,function(x) which.max(x) == 2,# & (x[2]!=x[1] & x[2]!= x[3]), - align = "left") - mins <- rollapply(baseQ$flow,3,function(x) which.min(x) == 2,# & (x[2]!=x[1] & x[2]!= x[3]), - align = "left") - #Create data frames of these local minima/maxima with the corresponding - #timestamp from the original data frame. Note that the rolling looks at the - #first three data points. If the first triplet has a local max as its second - #value, the first value in maxes will be TRUE.The timestamp to be associated - #with this should be that of the middle data point e.g. the second timestamp - #of the triplet! - mins <- data.frame(timestamp = baseQ$timestamp[c(FALSE,mins,FALSE)], - mins = baseQ$flow[c(FALSE,mins,FALSE)]) - maxes <- data.frame(timestamp = baseQ$timestamp[c(FALSE,maxes,FALSE)], - maxes = baseQ$flow[c(FALSE,maxes,FALSE)]) - - #Need to find mins/maxes below baseline flow. But first baseline flow must be - #defined. hreg() will try to fit mulitple horizontal lines through subset of - #data involving every point below that line until best fit is found. This - #becomes baseline flow, brk - #Find the break associated with this run and buffer by 10%. Based the hreg on - #the timescale selected by the user - # baselineFlowOption = One of c("Water Year","Month","Calendar Year","All") - if(baselineFlowOption == "All"){ - #Use the entire baseflow dataset to get baseline flow, brk - brk <- hreg(baseQ$baseQ,limit = 1) - brk <- brk * 1.1 - #Make as long as timeseries - brk <- rep(brk,nrow(baseQ)) - }else if(baselineFlowOption %in% c("Calendar Year", "Climate Year","Water Year")){ - #Based on the user option, determine the start month of the year - #designation. Water Years run from October - September, Climate years from - #April - March, and Calendar Years from January - December - if(baselineFlowOption == "Water Year"){ - WYS <- "10-01" - }else if(baselineFlowOption == "Climate Year"){ - WYS <- "04-01" - }else{ - WYS <- "01-01" - } - #Make dates of the input timesteps: - dataDates <- as.Date(timeIn) - #Create an emtpty vector for the new annual designations - dataWY <- NA - - #Identify months in the next year that are classified as the previous year's - #water/climate year e.g. January 2021 is in Water Year 2020 that begins in - #October. 
Get the month using gsub to get only the characters prior to first - #dash - WYM <- as.numeric(gsub("-.*","",WYS)) - if(WYM > 1){ - WYP <- seq(1,(WYM - 1)) - }else{ - WYP <- 0 - } - - #Add leading zeros to month number January - September - WYP[nchar(WYP) == 1]<-paste0("0",WYP[nchar(WYP) == 1]) - #Combine to form a regex pattern that searches for any of the months at the - #END of a string ($) - WYP <- paste0(WYP,"$",collapse = "|") - #Get the year and month associated with the dates - WY <- format(dataDates,"%Y-%m") - - #Initialize water year by making it the calendar year + 1 - dataWY <- as.numeric(gsub("-.*","",WY)) + 1 - - #Search for all months from January to the start of the water year. These - #months need to be assigned previous water year number (current calendar - #year) - dataWY[grepl(WYP,WY)] <- dataWY[grepl(WYP,WY)] - 1 - - #Exception case occurs when water year is calendar year, in which case water - #year ends in calendar year such that calendar year = water year - if(WYS == "01-01"){ - dataWY <- as.numeric(format(dataDates,"%Y")) - } - #Store the years in the appropriate column in baseQ data frame - baseQ$Year <- dataWY - #Create an empty vector the length of the baseQ data frame - brk <- numeric(nrow(baseQ)) - #For each unique water year, calendar year, etc., run the hreg and store in - #brk - for (i in unique(dataWY)){ - baseQsubset <- baseQ[baseQ$Year == i,] - #Use the subset of the baseflow dataset to get baseline flow, brk - brki <- hreg(baseQsubset$baseQ,limit = 1) - brki <- brki * 1.1 - #Store results - brk[baseQ$Year == i] <- brki - } - }else if(baselineFlowOption == "Month"){ - #Make dates of the input timesteps: - dataDates <- as.Date(timeIn) - - #Create a vector of the months and years, essentially getting a vector of - #all months in the timeIn vector - monthYears <- format(dataDates,"%m-%Y") - - #Create an empty vector the length of the baseQ data frame - brk <- numeric(nrow(baseQ)) - for (i in unique(monthYears)){ - baseQsubset <- baseQ[monthYears == i,] - #Use the subset of the baseflow dataset to get baseline flow, brk - brki <- hreg(baseQsubset$baseQ,limit = 1) - brki <- brki * 1.1 - #Store results - brk[monthYears == i] <- brki - } - } - #Add baseline flow to baseQ - baseQ$baselineQ <- brk - - #Next step is to isolate storms. This can be accomplished by taking a minimum - #and the next point to fall below baseline flow, brk. Each storm is checked to - #ensure a maximum above some reference level, here 1.5*brk. This eliminates - #small storms with little or no peak or rises in baseflow. Only minimums below - #the brk value are considered as these are storms that occur at baseflow and - #fully rise/recede. - - #Get the times associated with minimums that are below baseline flow brk. 
- #First, join in the baseline flow for the timeperiod: - mins <- sqldf("SELECT mins.*, baseQ.baselineQ - FROM mins - LEFT JOIN baseQ - ON mins.timestamp = baseQ.timestamp") - - if(allMinimaStorms){ - #Use all minima as potential storm start and stop points - x <- mins$timestamp - #Get the corresponding local maximums - y <- maxes$maxes - }else{ - #Get only the minima timestamps that lie below baseline flow - x <- mins$timestamp[mins$mins < mins$baselineQ] - #Get the corresponding local maximums - y <- maxes$maxes[mins$mins < mins$baselineQ] - } - - #A data frame to build with storms in each column - stormsep <- list() - #An ID column to store the storm number in baseQ - baseQ$stormID <- NA - - #Start evaluating each set of minima to evaluate if there is a qualifying - #storm event. If there is, store it with all relevant data - for (i in 1:(length(x) - 1)){ - # if(i==73){browser()} - endIndex <- 1 - #Initial guess at storm endpoints e.g. two local minimums - storm <- c(x[i], x[i + endIndex]) - #initial stormflows and the times associated with those flows: - stormflow <- inflow[timeIn >= storm[1] & timeIn <= storm[2]] - - #When using allMinimaStorms, some minima may be combined. Skip loops that - #feature combined storms. So, if at frist storm and nextStorm are January - #1st - Jan 10 and Jan 10 - Jan 12, but these are combined into one (Jan 1 - - #Jan 12), then we can skip the loop of Jan 10 - Jan 12. First, make sure - #there is a storm in stormsep and that this is not the first loop i.e. i != - #1 - if(i > 1 && length(stormsep) > 0){ - #Get the previous storm - prevStorm <- stormsep[[length(stormsep)]] - #Remove the NAs so we only have the timestamps associated with the current - #storm - prevStorm <- prevStorm[!is.na(prevStorm$flow),] - #If the end point of the storm is in the previous storm, skip this - #iteration. This storm has already been accounted for. - if(storm[2] %in% prevStorm$timestamp){ - #Skip to next iteration of for loop: - next - } - } - - #If the second minimum flow is practically equal to the next local maximum, - #combine with the next storm as they are likely of the same storm (of - #course, this assumes that the timestamps match up which they should in the - #current set-up regardless of allMinimaStorms). But only do this if - #allMinimaStorms is TRUE since otherwise baselineflow is the cut-off. - if(allMinimaStorms & !is.na(x[i + 1 + endIndex])){ - #Initial guess at storm endpoints e.g. two local minimums - nextStorm <- c(x[i + 1], x[i + 1 + endIndex]) - #initial stormflows and the times associated with those flows: - nextStormflow <- inflow[timeIn >= nextStorm[1] & timeIn <= nextStorm[2]] - #What is the maximum of this storm event? - nextMaxStormFlow <- max(nextStormflow) - - while(!is.na(nextMaxStormFlow) & - stormflow[length(stormflow)] >= 0.8 * nextMaxStormFlow){ - endIndex <- endIndex + 1 - #Initial guess at storm endpoints e.g. two local minimums - storm <- c(x[i], x[i + endIndex]) - #initial stormflows and the times associated with those flows: - stormflow <- inflow[timeIn >= x[i] & timeIn <= x[i + endIndex]] - - if(!is.na(x[i + 1 + endIndex])){ - #Initial guess at storm endpoints e.g. two local minimums - nextStorm <- c(x[i + endIndex], x[i + 1 + endIndex]) - #initial stormflows and the times associated with those flows: - nextStormflow <- inflow[timeIn >= nextStorm[1] & timeIn <= nextStorm[2]] - #What is the maximum of this storm event? 
- nextMaxStormFlow <- max(nextStormflow) - }else{ - nextMaxStormFlow <- NA - } - } - } - - #Get the times associated with the storm - stormtimes <- timeIn[timeIn >= x[i] & timeIn <= x[i + endIndex]] - - #When does the maximum flow associated with this storm occur? - maxtime <- stormtimes[stormflow == max(stormflow)][1] - - #If there is a point at baseflow before next minimum, use it instead to - #prevent over elongated tails. We just need to ensure it takes place before - #the next minima, is over brk, and occurs after maxtime - endAlt <- (baseQ$timestamp[baseQ$flow < baseQ$baselineQ & - baseQ$timestamp > x[i] & - baseQ$timestamp < x[i + endIndex] & - baseQ$timestamp > maxtime])[1] - #If an alternate endpoint for the storm was found, redefine the storm: - if (!is.na(endAlt)){ - storm <- c(x[i],endAlt) - stormflow <- inflow[timeIn >= x[i] & timeIn <= endAlt] - stormtimes <- timeIn[timeIn >= x[i] & timeIn <= endAlt] - } - #data frame of storm data - stormdat <- data.frame( - timestamp = stormtimes, - baseQ = baseQ$baseQ[baseQ$timestamp >= storm[1] & baseQ$timestamp <= storm[2]], - flow = stormflow, - baselineQ = baseQ$baselineQ[baseQ$timestamp >= storm[1] & baseQ$timestamp <= storm[2]] - ) - - #data frame of whole time series - store <- data.frame(timestamp = baseQ$timestamp, flow = NA,baseflow = NA) - #Fills in only flow data during storm, leaving rest as NA - store$flow[store$timestamp >= storm[1] & - store$timestamp <= storm[2]] <- stormdat$flow - store$baseflow[store$timestamp >= storm[1] & - store$timestamp <= storm[2]] <- stormdat$baseQ - store$baselineflow[store$timestamp >= storm[1] & - store$timestamp <= storm[2]] <- stormdat$baselineQ - - #If maximum exceeds limit, add it to the stormsep list: - if(any(store$flow > (2.0 * store$baselineflow),na.rm = TRUE)){ - #Set the storm number in baseQ. In case of overlapping end/start points, - #store both IDs. - startID <- baseQ$stormID[baseQ$timestamp == storm[1]] - if(!is.na(startID)){ - #If the stormID in baseQ is already entered for the start date, it - #overlaps with a previous storm since both use that local minima. Store - #both IDs. - baseQ$stormID[baseQ$timestamp == storm[1]] <- paste0(startID,",",length(stormsep) + 1) - baseQ$stormID[baseQ$timestamp > storm[1] & baseQ$timestamp <= storm[2]] <- length(stormsep) + 1 - }else{ - #If there is no overlap, enter the storm ID for all storm timestamps - baseQ$stormID[baseQ$timestamp >= storm[1] & baseQ$timestamp <= storm[2]] <- length(stormsep) + 1 - } - - stormsep[[length(stormsep) + 1]] <- store - } - } - - #Now plot each storm and fit exponential curves to rising and falling limbs - #Store coefficients and statistics for each curve into a data frame, looking - #at shape of curve and the adjusted R square values. Store each storm as a PNG - #graph in the designated area. Need to prevent errors from zero flow. Added - #0.0001 to all flow values. 
This WILL RESULT IN BIAS - ext <- ".png" - #Empty data frame to store statistics - transients <- data.frame(startDate=character(length(stormsep)), - endDate = NA, maxDate = NA, - rising = NA, RsqR = NA, falling = NA, RsqF = NA, - durAll = NA,durF = NA,durR = NA, - volumeTotalMG = NA, - volumeAboveBaseQMG = NA, - volumeAboveBaselineQMG = NA, - volumeTotalMG_rise = NA, - volumeAboveBaseQMG_rise = NA, - volumeAboveBaselineQMG_rise = NA, - volumeTotalMG_fall = NA, - volumeAboveBaseQMG_fall = NA, - volumeAboveBaselineQMG_fall = NA) - for (i in 1:length(stormsep)){ - #Find the storm of interest - storm <- stormsep[[i]] - #remove nas: - storm <- storm[!is.na(storm[,2]),] - #Look for where the max is - maxtime <- storm[storm[,2] == max(storm[,2]),1][1] - - #Store the start and end time of the storm - transients$startDate[i] <- format(storm$timestamp[1],"%Y-%m-%d") - transients$endDate[i] <- format(storm$timestamp[nrow(storm)],"%Y-%m-%d") - transients$maxDate[i] <- format(storm[storm[,2] == max(storm[,2]),1][1],"%Y-%m-%d") - - #Separate rising and falling limbs based on maxtime e.g. the rising limb is - #all values leading up to maxtime - rising <- storm[storm[,1] <= maxtime,] - falling <- storm[storm[,1] >= maxtime,] - - #What is the volume of the storm streamflow i.e. total volume? First, get - #the difference in timestamps throughout the storm from one time to the - #next: - timeDiff <- difftime(storm$timestamp[2:nrow(storm)], storm$timestamp[1:(nrow(storm) - 1)], - units = "secs") - timeDiff <- as.numeric(timeDiff) - #Repeat for rising and falling limbs only: - #Rising: - timeDiff_rise <- difftime(rising$timestamp[2:nrow(rising)], rising$timestamp[1:(nrow(rising) - 1)], - units = "secs") - timeDiff_rise <- as.numeric(timeDiff_rise) - #Falling: - timeDiff_fall <- difftime(falling$timestamp[2:nrow(falling)], falling$timestamp[1:(nrow(falling) - 1)], - units = "secs") - timeDiff_fall <- as.numeric(timeDiff_fall) - #Using a trapezoidal approach, get the area of each trapezoid to estimate - #volume of the storm. Use three approaches. Total storm volume, volume above - #baseflow, volume above baseline flow - #Total storm flow: - trapzArea <- function(timeDiff,stormFlow){ - out <- timeDiff * ((stormFlow[1:(length(stormFlow) - 1)] + stormFlow[2:length(stormFlow)]) / 2) - } - trapz_total <- trapzArea(timeDiff,storm$flow) - #Only flow above baseflow: - trapz_abovebaseQ <- trapzArea(timeDiff,(storm$flow - storm$baseflow)) - #Only flow above baseline flow. THIS CAN BE NEGATIVE AND THEREFORE - #UNRELIABLE?: - trapz_abovebaselineQ <- trapzArea(timeDiff,(storm$flow - storm$baselineflow)) - - #Rising/falling storm flow: - trapz_total_rise <- trapzArea(timeDiff_rise,rising$flow) - trapz_total_fall <- trapzArea(timeDiff_fall,falling$flow) - #Only flow above baseflow: - trapz_abovebaseQ_rise <- trapzArea(timeDiff_rise,(rising$flow - rising$baseflow)) - trapz_abovebaseQ_fall <- trapzArea(timeDiff_fall,(falling$flow - falling$baseflow)) - #Only flow above baseline flow. 
THIS CAN BE NEGATIVE AND THEREFORE - #UNRELIABLE?: - trapz_abovebaselineQ_rise <- trapzArea(timeDiff_rise,(rising$flow - rising$baselineflow)) - trapz_abovebaselineQ_fall <- trapzArea(timeDiff_fall,(falling$flow - falling$baselineflow)) - - #Total volume is the sum of area of the trapezoids found above converted to - #MG from CF (1 CF * 12^3 in^3/cf * 231 gal/in^3 * 1 MG/1000000 gal): - volume_total <- sum(trapz_total) * 12 * 12 * 12 / 231 / 1000000 - volume_abovebaseQ <- sum(trapz_abovebaseQ) * 12 * 12 * 12 / 231 / 1000000 - volume_abovebaselineQ <- sum(trapz_abovebaselineQ) * 12 * 12 * 12 / 231 / 1000000 - #Rising Limb - volume_total_rise <- sum(trapz_total_rise) * 12 * 12 * 12 / 231 / 1000000 - volume_abovebaseQ_rise <- sum(trapz_abovebaseQ_rise) * 12 * 12 * 12 / 231 / 1000000 - volume_abovebaselineQ_rise <- sum(trapz_abovebaselineQ_rise) * 12 * 12 * 12 / 231 / 1000000 - #Falling limb: - volume_total_fall <- sum(trapz_total_fall) * 12 * 12 * 12 / 231 / 1000000 - volume_abovebaseQ_fall <- sum(trapz_abovebaseQ_fall) * 12 * 12 * 12 / 231 / 1000000 - volume_abovebaselineQ_fall <- sum(trapz_abovebaselineQ_fall) * 12 * 12 * 12 / 231 / 1000000 - #Store results: - transients$volumeTotalMG[i] <- volume_total - transients$volumeAboveBaseQMG[i] <- volume_abovebaseQ - transients$volumeAboveBaselineQMG[i] <- volume_abovebaselineQ - transients$volumeTotalMG_rise[i] <- volume_total_rise - transients$volumeAboveBaseQMG_rise[i] <- volume_abovebaseQ_rise - transients$volumeAboveBaselineQMG_rise[i] <- volume_abovebaselineQ_rise - transients$volumeTotalMG_fall[i] <- volume_total_fall - transients$volumeAboveBaseQMG_fall[i] <- volume_abovebaseQ_fall - transients$volumeAboveBaselineQMG_fall[i] <- volume_abovebaselineQ_fall - - #Create an exponential regression for the rising limb to roughly fit the - #rising limb based on an "ideal" hydrograph - modelR <- lm(log(rising[,2] + 0.0001) ~ seq(1,length(rising[,1]))) - #Store exponential coefficient and adjusted r squared values - transients$rising[i] <- summary(modelR)$coefficients[2] - transients$RsqR[i] <- summary(modelR)$adj.r.squared - - #Create an exponential regression for the falling limb - modelF <- lm(log(falling[,2] + 0.0001) ~ seq(1,length(falling[,1]))) - transients$falling[i] <- summary(modelF)$coefficients[2] - transients$RsqF[i] <- summary(modelF)$adj.r.squared - - #Finds duration of the storm, rising and falling limbs combined - transients$durAll[i] <- length(storm$timestamp) - #Finds duration of the rising limb - transients$durF[i] <- length(rising$timestamp) - #Finds duration of the falling limb - transients$durR[i] <- length(falling$timestamp) - - #Plot the storm and the fitted rising and falling limbs and store them in - #designated path. 
Include the baseflow and and the baseline flow brk - if(plt == T){ - #Set plot output path and dimensions - png(paste0(path,"storm",i,ext), width=1820,height=760) - #Set plot margines - par(mar=c(5,6,2,4)) - #Plot the storm, making the labels a little thicker and the lines of the - #plot and labeling the axes - plot(storm[,1], storm[,2], type='l', - xlab='Date', ylab='Flow (cfs)', - lwd=2, cex.axis=2, cex.lab=2) - #Plot the fitted rising limb: - lines(storm[storm[,1] <= maxtime,1], - exp(fitted(modelR)), - col = 'darkblue',lwd = 2) - #Plot the fitted falling limb: - lines(storm[storm[,1] >= maxtime,1], - exp(fitted(modelF)), - col = 'darkred', lwd = 2) - #Plot the baseflow - lines(storm[,1], storm[,3], - col = "darkgreen", lwd = 2) - #Plot the baseline flow brk as a dashed line via lty = 3 - lines(storm[,1], storm[,4],lty = 3,lwd = 2) - #Put a small legend on the plot - legend("topleft",c("Flow","Baseflow","Rise Limb","Fall Limb","Baseline"), - col = c("black","darkgreen","darkblue","darkred","black"), - lty = c(1,1,1,1,3), - bty = "n") - #Close the plot PNG and output the file - dev.off() - } - } - #rm(maxtime,storm,rising,falling,modelR,modelF,i,ext,fxn_locations) - out <- list(Storms = stormsep,Stats = transients, - flowData = baseQ) - return(out) -} - -#Below function hreg (horizontal regression) will try to fit mulitple -#horizontal lines through subset of data involving every point below that line -#until best fit is found. This becomes baseline flow, brk -hreg <- function(x, limit = 1){ - #What is the numeric percentile of x that is of percent limit? - lim <- as.numeric(quantile(x,limit)) - #Give all of x below the percentile of limit: - x <- x[x <= lim] - #Get all percentiles of x from 0 to 100% by 0.1%: - lns <- as.numeric(quantile(x, seq(0,1,0.001))) - #Keep only values above 0 - lns <- lns[lns != 0] - #A vector for mean square error - mse <- numeric(length(lns)) - #For each values in lns, determine the mean square error if this is a - #horizontal regression of the data in x - for (i in 1:length(lns)){ - line <- lns[i] - mse[i] <- mean((x - line)^2) - } - #Return the percentile x that created the minimum least square error: - return(lns[which.min(mse)]) -} - -# pathOut <- paste0(path,"storm",i,ext) - - -# outTest <- stormSeparate(timeIn, inflow, -# plt = FALSE,path = paste0(getwd(),"/"), -# allMinimaStorms = TRUE, -# baselineFlowOption = "Month" -# ) -# length(outTest$Storms) diff --git a/HARP-2024-2025/storm_event_ratings.R b/HARP-2024-2025/storm_event_ratings.R deleted file mode 100644 index fd00f31a..00000000 --- a/HARP-2024-2025/storm_event_ratings.R +++ /dev/null @@ -1,39 +0,0 @@ -## Set working directory -setwd("C:/Users/natef/OneDrive - Virginia Tech/HARP") - -#get packages -library(ggplot2) -library(sqldf) -library(tidyr) - -#set varibales for stream gage id -gage_id <- "02026000" - - -#download stormeventflow and rating values -stormflow <- read.csv(paste("http://deq1.bse.vt.edu:81/met/stormVol_nldas2/flow/usgs_ws_",gage_id,"-stormevent-flow.csv",sep = "")) -ratings0 <- read.csv(paste("http://deq1.bse.vt.edu:81/met/nldas2/out/usgs_ws_",gage_id,"-nldas2-lm_simple-rating-ts.csv",sep = "")) - -#clean up ratings dates -ratings <- ratings0 %>% - separate(start_date, into = c("timestamp","start_time"), sep = " ") - -#sql join -# Perform the join -result <- sqldf("SELECT stormflow.timestamp, stormflow.stormID, ratings.rating - FROM stormflow - INNER JOIN ratings ON stormflow.timestamp = ratings.timestamp") -is.na(result$stormID) - -plot <- ggplot(data = result, - mapping = 
aes(x = !is.na(stormID), y = rating))+ - geom_boxplot()+ - xlab("Storm Event?")+ - ggtitle("Comparison of Ratings During Storm Events and Outside of Storm Events")+ - theme_bw()+ - ylim(c(0,1)) - -plot - - - diff --git a/HARP-2024-2025/using_basic_analytics.R b/HARP-2024-2025/using_basic_analytics.R deleted file mode 100644 index 84c8ad3e..00000000 --- a/HARP-2024-2025/using_basic_analytics.R +++ /dev/null @@ -1,57 +0,0 @@ -source("C:/Users/natef/OneDrive - Virginia Tech/HARP/Github/HARParchive/HARP-2024-2025/basic_analytics_for_vahydro.R") -library("IHA") -library("ggplot2") -library(sqldf) -# prism <- read.csv("http://deq1.bse.vt.edu:81/met/stormVol_prism/precip/usgs_ws_01613900-PRISM-all.csv") -# prism_summary <- summary_analytics(prism) -# -# -# daymet <- read.csv("http://deq1.bse.vt.edu:81/met/daymet/precip/usgs_ws_01613900-daymet-all.csv") -# daymet_summary <- summary_analytics(daymet) -# -# nldas2 <- read.csv("http://deq1.bse.vt.edu:81/met/nldas2/precip/usgs_ws_01613900-nldas2-all.csv") -# nldas2_summary <- summary_analytics(nldas2) - -#pull any value like this: - -# prism_l90_precip_in <- prism_summary$l90_precip_in -# daymet_l90_precip_in <- daymet_summary$l90_precip_in -# nldas2_l90_precip_in <- nldas2_summary$l90_precip_in - -#Or even without the summary: - -summary_analytics(prism)$l90_precip_in - - -#visualizing data: - -# visualizations <- function(prism.metric, daymet.metric, nldas2.metric, title = NULL){ -# ggplot(data = NULL, mapping = aes(x = c("prism","dayment","nldas2"), -# y = c(prism.metric, daymet.metric, nldas2.metric)))+ -# geom_bar(stat = "identity", fill = "lightblue3")+ -# labs(title = title, y = "precip (in)", x = "model")} -# -# visualizations(prism.metric = prism_summary$precip_annual_max_in, -# daymet.metric = daymet_summary$precip_annual_max_in, -# nldas2.metric = nldas2_summary$precip_annual_max_in, -# title = "yearly max precip") -# visualizations(prism.metric = prism_summary$precip_annual_min_in, -# daymet.metric = daymet_summary$precip_annual_min_in, -# nldas2.metric = nldas2_summary$precip_annual_min_in, -# title = "yearly min precip") - -visualizations <- function(gage_id, metric){ - ggplot(data = NULL, mapping = aes(x = c("prism","dayment","nldas2"), - y = c(summary_analytics(read.csv(paste0("http://deq1.bse.vt.edu:81/met/stormVol_prism/precip/",gage_id,"-PRISM-all.csv")))[[metric]], - summary_analytics(read.csv(paste0("http://deq1.bse.vt.edu:81/met/daymet/precip/",gage_id,"-daymet-all.csv")))[[metric]], - summary_analytics(read.csv(paste0("http://deq1.bse.vt.edu:81/met/nldas2/precip/",gage_id,"-nldas2-all.csv")))[[metric]])))+ - geom_bar(stat = "identity", fill = "lightblue3")+ - labs(title = as.character(metric), y = "precip (in)", x = "model")} - -# perform basic bar plot visualizations for any of these 5 metrics for any gage -visualizations(gage_id = "usgs_ws_01613900", metric = "precip_annual_max_in") -visualizations(gage_id = "usgs_ws_01613900", metric = "l90_precip_in") -visualizations(gage_id = "usgs_ws_01613900", metric = "precip_annual_mean_in") -visualizations(gage_id = "usgs_ws_01613900", metric = "precip_annual_min_in") -visualizations(gage_id = "usgs_ws_01613900", metric = "precip_daily_max_in") - diff --git a/HARP-2024-2025/variable definitions.R b/HARP-2024-2025/variable definitions.R deleted file mode 100644 index d5e029e7..00000000 --- a/HARP-2024-2025/variable definitions.R +++ /dev/null @@ -1,6 +0,0 @@ -coverage_hydrocode <- "usgs_ws_01613900" -coverage_bundle <- "watershed" #type of coverage we are looking for -coverage_ftype <- 
"usgs_full_drainage" #feature type -model_version <- "cbp-6.1" -met_file <- "http://deq1.bse.vt.edu:81/met/stormVol_prism/precip/usgs_ws_01613900-PRISM-all.csv" -scenario_name <- "stormVol_prism" diff --git a/HARP-2024-2025/yearly_metrics.R b/HARP-2024-2025/yearly_metrics.R deleted file mode 100644 index f8716b53..00000000 --- a/HARP-2024-2025/yearly_metrics.R +++ /dev/null @@ -1,73 +0,0 @@ -#number of days with precip_in and longest duration storm per yr - -library(sqldf) - -#this function uses a series of SQL chunks to compute yearly data frames -#that include helpful QA metrics. -yearly_metrics <- function(df){ -# Create Daily summary to make all data daily -daily_summary <- - sqldf( - "SELECT yr, mo, da, obs_date, SUM(precip_in) AS precip_in - FROM df - GROUP BY yr, mo, da" - ) #include obs_date after SQL to data frame - -# Number of days with precip_in per yr -rainfall_hist <- ggplot(data = daily_summary, mapping = aes(x = obs_date, y = precip_in))+ - geom_histogram() -rainfall_days <- sqldf("SELECT yr, COUNT(*) - AS DaysWithRainfall - FROM daily_summary - WHERE precip_in > 0.01 - GROUP BY yr") - -# # Longest duration storm per yr -# this marks the storm groups by separating due to days with no rainfall -#the NWS states that >0.01 is "measurable precipitation". -#this changes our numbers by ~20 days for the Hogue creek gage -storm_events <- sqldf("WITH RainEvents AS ( - SELECT obs_date, precip_in, yr, - (SELECT SUM(precip_in <= 0.01) - FROM daily_summary rd2 - WHERE rd2.obs_date <= rd1.obs_date) - AS StormGroup - FROM daily_summary rd1), - StormDurations AS ( - SELECT yr, StormGroup, COUNT(*) - AS StormDuration - FROM RainEvents - WHERE precip_in > 0.01 - GROUP BY yr, StormGroup) - SELECT yr, MAX(StormDuration) AS LongestStorm - FROM StormDurations - GROUP BY yr") - -# RainEvents <- sqldf(" -# SELECT obs_date, precip_in, yr, -# (SELECT SUM(precip_in <= 0.01) -# FROM daily_summary AS rd2 -# WHERE rd2.obs_date <= rd1.obs_date) -# AS StormGroup -# FROM daily_summary AS rd1") - -# Printing results -#merge(rainfall_days) -return(merge(rainfall_days,storm_events)) -return(RainEvents) -} -## It seems like NLDAS has much higher results... - -yearly_metrics(prism) -yearly_metrics(nldas2) - - -daily_summary <- - sqldf( - "SELECT yr, mo, da, obs_date, SUM(precip_in) AS precip_in - FROM df - GROUP BY yr, mo, da" - ) - -rainfall_hist <- ggplot(data = daily_summary, mapping = aes(x = precip_in))+ - geom_histogram(binwidth = 0.01)