Chapter 3 Snow Data Example

In this assignment, I explored web scraping, different functions and iterations by using a data set from the Center for Snow and Avalanche Studies Website and read a table in. This table contains links to data I want and to programatically download for three sites. I don’t know much about these sites, but they contain incredibly rich snow, temperature, and precipitation data.

3.1 Reading an html

3.1.1 Extract CSV links from webpage

site_url <- 'https://snowstudies.org/archived-data/'

#Read the web url
webpage <- read_html(site_url)

#See if we can extract tables and get the data that way
tables <- webpage %>%
  html_nodes('table') %>%
  magrittr::extract2(3) %>%
  html_table(fill = TRUE)
#That didn't work, so let's try a different approach

#Extract only weblinks and then the URLs!
links <- webpage %>%
  html_nodes('a') %>%
  .[grepl('24hr',.)] %>%
  html_attr('href')

3.2 Data Download

3.2.1 Download data in a for loop

#Grab only the name of the file by splitting out on forward slashes
splits <- str_split_fixed(links,'/',8)

#Keep only the 8th column
dataset <- splits[,8] 

#generate a file list for where the data goes
file_names <- paste0('/Users/maddiebean21/Desktop/School/ESS580A7/bookdown_final/data/03-hw-snow/SASP_24hr.csv',dataset)

for(i in 1:3){
  download.file(links[i],destfile=file_names[i])
}

downloaded <- file.exists(file_names)

evaluate <- !all(downloaded)

3.2.2 Download data in a map

#Map version of the same for loop (downloading 3 files)
if(evaluate == T){
  map2(links[1:3],file_names[1:3],download.file)
}else{print('data already downloaded')}

## [[1]]
## [1] 0
## 
## [[2]]
## [1] 0
## 
## [[3]]
## [1] 0

3.3 Data read-in

3.3.1 Read in just the snow data as a loop

#Pattern matching to only keep certain files
snow_files <- file_names %>%
  .[!grepl('SG_24',.)] %>%
  .[!grepl('PTSP',.)]

empty_data <- list()

snow_data <- for(i in 1:length(snow_files)){
  empty_data[[i]] <- read_csv(snow_files[i]) %>%
    select(Year,DOY,Sno_Height_M)
  }

## Rows: 6211 Columns: 52

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (52): ArrayID, Year, DOY, Hour, LoAir_Min_C, LoAir_Min_Time, LoAir_Max_C...

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Rows: 6575 Columns: 48

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (48): ArrayID, Year, DOY, Hour, LoAir_Min_C, LoAir_Min_Time, LoAir_Max_C...

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

snow_data_full <- do.call('rbind',empty_data)

summary(snow_data_full)

##       Year           DOY         Sno_Height_M   
##  Min.   :2003   Min.   :  1.0   Min.   :-3.523  
##  1st Qu.:2008   1st Qu.: 92.0   1st Qu.: 0.350  
##  Median :2012   Median :183.0   Median : 0.978  
##  Mean   :2012   Mean   :183.1   Mean   : 0.981  
##  3rd Qu.:2016   3rd Qu.:274.0   3rd Qu.: 1.520  
##  Max.   :2021   Max.   :366.0   Max.   : 2.905  
##                                 NA's   :4554

3.3.2 Read in the data as a map function

#making the data as a map function
our_snow_reader <- function(file){
  name = str_split_fixed(file,'/',2)[,2] %>%
    gsub('_24hr.csv','',.)
  df <- read_csv(file) %>%
    select(Year,DOY,Sno_Height_M) %>%
    mutate(site = name)
}

#creating an object with the functions
snow_data_full <- map_dfr(snow_files,our_snow_reader)

## Rows: 6211 Columns: 52

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (52): ArrayID, Year, DOY, Hour, LoAir_Min_C, LoAir_Min_Time, LoAir_Max_C...

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Rows: 6575 Columns: 48

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (48): ArrayID, Year, DOY, Hour, LoAir_Min_C, LoAir_Min_Time, LoAir_Max_C...

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

summary(snow_data_full)

##       Year           DOY         Sno_Height_M        site          
##  Min.   :2003   Min.   :  1.0   Min.   :-3.523   Length:12786      
##  1st Qu.:2008   1st Qu.: 92.0   1st Qu.: 0.350   Class :character  
##  Median :2012   Median :183.0   Median : 0.978   Mode  :character  
##  Mean   :2012   Mean   :183.1   Mean   : 0.981                     
##  3rd Qu.:2016   3rd Qu.:274.0   3rd Qu.: 1.520                     
##  Max.   :2021   Max.   :366.0   Max.   : 2.905                     
##                                 NA's   :4554

3.3.3 Plot snow data

#making an object for the yearly snow data points
snow_yearly <- snow_data_full %>%
  group_by(Year,site) %>%
  summarize(mean_height = mean(Sno_Height_M,na.rm=T))

## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.

#plotting the yearly snow data
ggplot(snow_yearly,aes(x=Year,y=mean_height,color=site)) + 
  geom_point() +
  ggthemes::theme_few() + 
  ggthemes::scale_color_few()+
  labs(x="Mean Height", y="Year", title="Yearly Snow Data")

## Warning: Removed 2 rows containing missing values (geom_point).

3.4 Extracting the meteorological data URLs

I used the rvest package to get the URLs for the SASP forcing and SBSP_forcing meteorological datasets.

#creating values for the meteorological data URLs
links_hw <- webpage %>%
  html_nodes('a') %>%
  .[grepl('forcing',.)] %>%
  html_attr('href')
links_hw

## [1] "https://snowstudies.org/wp-content/uploads/2022/02/SBB_SASP_Forcing_Data.txt"
## [2] "https://snowstudies.org/wp-content/uploads/2022/02/SBB_SBSP_Forcing_Data.txt"

3.5 Download the meteorological data.

Next, I used the download_file and str_split_fixed commands, along with a for loop, to download the data and saved it in our data folder.

# grab only the name of the file by splitting out on forward slashes
splits_hw <- str_split_fixed(links_hw, '/', 8)

forcingdataset <- splits_hw[,8] %>% 
  gsub('.txt','',.)

file_names_hw <- paste0('/Users/maddiebean21/Desktop/School/ESS580A7/bookdown_final/data/03-hw-snow/SBB_SASP_Forcing_Data', forcingdataset)

# creating a for loop
for(i in 1:length(file_names_hw)){
  download.file(links_hw[i], destfile=file_names_hw[i])
}

file_names_hw

## [1] "/Users/maddiebean21/Desktop/School/ESS580A7/bookdown_final/data/03-hw-snow/SBB_SASP_Forcing_DataSBB_SASP_Forcing_Data"
## [2] "/Users/maddiebean21/Desktop/School/ESS580A7/bookdown_final/data/03-hw-snow/SBB_SASP_Forcing_DataSBB_SBSP_Forcing_Data"

3.6 Custom function writing

I wrote a custom function in order to read in the data and append a site column to the data.

# this code grabs the variable names from the metadata pdf file
library(pdftools)

## Using poppler version 20.12.1

q3_headers <- pdf_text('https://snowstudies.org/wp-content/uploads/2022/02/Serially-Complete-Metadata-text08.pdf') %>%
  readr::read_lines(.) %>%
  trimws(.) %>%
  str_split_fixed(.,'\\.',2) %>%
  .[,2] %>%
  .[1:26] %>%
  str_trim(side = "left")
q3_headers

##  [1] "year"                                                       
##  [2] "month"                                                      
##  [3] "day"                                                        
##  [4] "hour"                                                       
##  [5] "minute"                                                     
##  [6] "second"                                                     
##  [7] "precip [kg m-2 s-1]"                                        
##  [8] "sw down [W m-2]"                                            
##  [9] "lw down [W m-2]"                                            
## [10] "air temp [K]"                                               
## [11] "windspeed [m s-1]"                                          
## [12] "relative humidity [%]"                                      
## [13] "pressure [Pa]"                                              
## [14] "specific humidity [g g-1]"                                  
## [15] "calculated dewpoint temperature [K]"                        
## [16] "precip, WMO-corrected [kg m-2 s-1]"                         
## [17] "air temp, corrected with Kent et al. (1993) [K]"            
## [18] "air temp, corrected with Anderson and Baumgartner (1998)[K]"
## [19] "air temp, corrected with Nakamura and Mahrt (2005) [K]"     
## [20] "air temp, corrected with Huwald et al. (2009) [K]"          
## [21] "qc code precip"                                             
## [22] "qc code sw down"                                            
## [23] "qc code lw down"                                            
## [24] "qc code air temp"                                           
## [25] "qc code wind speed"                                         
## [26] "qc code relhum"

#creating a function
q3_reader <- function(file){
  name = str_split_fixed(file, '/', 2)[,2] 
  name2 = str_split_fixed(file, '_', 4)[,2]
  q3test = read.delim(file, header = FALSE, sep ="", col.names = q3_headers, skip = 4) %>%
  select(1:14) %>%
  mutate(site=name2)
}

3.7 Summary of meteorlogical files

I used the map function to read in both meteorological files and then displayed a summary of my tibble.

#reading in the forcing data 
forcing_data_full <- map_dfr(file_names_hw, q3_reader)

summary(forcing_data_full)

##       year          month             day             hour           minute 
##  Min.   :2003   Min.   : 1.000   Min.   : 1.00   Min.   : 0.00   Min.   :0  
##  1st Qu.:2005   1st Qu.: 3.000   1st Qu.: 8.00   1st Qu.: 5.75   1st Qu.:0  
##  Median :2007   Median : 6.000   Median :16.00   Median :11.50   Median :0  
##  Mean   :2007   Mean   : 6.472   Mean   :15.76   Mean   :11.50   Mean   :0  
##  3rd Qu.:2009   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:17.25   3rd Qu.:0  
##  Max.   :2011   Max.   :12.000   Max.   :31.00   Max.   :23.00   Max.   :0  
##      second  precip..kg.m.2.s.1. sw.down..W.m.2.     lw.down..W.m.2.  
##  Min.   :0   Min.   :0.000e+00   Min.   :-9999.000   Min.   :-9999.0  
##  1st Qu.:0   1st Qu.:0.000e+00   1st Qu.:   -3.510   1st Qu.:  173.4  
##  Median :0   Median :0.000e+00   Median :   -0.344   Median :  231.4  
##  Mean   :0   Mean   :3.838e-05   Mean   :-1351.008   Mean   :-1325.7  
##  3rd Qu.:0   3rd Qu.:0.000e+00   3rd Qu.:  294.900   3rd Qu.:  272.2  
##  Max.   :0   Max.   :6.111e-03   Max.   : 1341.000   Max.   :  365.8  
##   air.temp..K.   windspeed..m.s.1.   relative.humidity.... pressure..Pa.  
##  Min.   :242.1   Min.   :-9999.000   Min.   :  0.011       Min.   :63931  
##  1st Qu.:265.8   1st Qu.:    0.852   1st Qu.: 37.580       1st Qu.:63931  
##  Median :272.6   Median :    1.548   Median : 59.910       Median :65397  
##  Mean   :272.6   Mean   : -790.054   Mean   : 58.891       Mean   :65397  
##  3rd Qu.:279.7   3rd Qu.:    3.087   3rd Qu.: 81.600       3rd Qu.:66863  
##  Max.   :295.8   Max.   :  317.300   Max.   :324.800       Max.   :66863  
##  specific.humidity..g.g.1.     site          
##  Min.   :0.000000          Length:138336     
##  1st Qu.:0.001744          Class :character  
##  Median :0.002838          Mode  :character  
##  Mean   :0.003372                            
##  3rd Qu.:0.004508                            
##  Max.   :0.014780

##Average yearly temperature

I made a line plot of mean temp by year by site (using the air temp [K] variable). What is suspicious about this data is temperature variation at the beginning of the graph, especially since the temperature is in Kelvin. This eludes that temperature was most likely not collected in the earlier years.

#creating an object to plot, grabbing the mean temperature by year by site
q5_yearly <- forcing_data_full %>%
  group_by(year,site) %>%
  summarize(mean_temp_k = mean(air.temp..K.,na.rm=T))

## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.

#plotting the mean temperature by year by site
ggplot(q5_yearly) + 
  geom_line(aes(x=year,y=mean_temp_k,color=site)) + 
  ggthemes::theme_few() + 
  ggthemes::scale_color_few()

What is suspicious about this data is temperature variation at the beginning of the graph, especially since the temperature is in Kelvin. This eludes that temperature was most likely not collected in the earlier years.

3.8 Monthy average temperature plot

Here, i wrote a function in order to make line plots of monthly average temperature at each site for a given year. I used a for loop to make these plots for 2005 to 2010. Both sites follow the same trend for each month, however, the SBSP site is never warmer than SASP site. There are times where are almost the same temperature, but SBSP never exceeds SASP.

#creating a function for the monthly average temperature
forcingmonthyear <- function(forcing_data_full, year){
  monthlytemp<-forcing_data_full %>%
    group_by(month,year,site) %>%
    summarize(monthly_air_temp = mean(air.temp..K.,na.rm=T)) %>%
    filter(yr == year)
  
  #plotting the function
  plots <- ggplot(monthlytemp, aes(x = month, y = monthly_air_temp, color = site)) +
    geom_line(size=2)+
    facet_wrap(~year)
    labs(title= monthlytemp$year,
         x = "Month",
         y = "Temperature (K)")
  
  print(plots)
}

years <- c(2005,2006,2007,2008,2009,2010)
  
#creating a for loop
for (yr in years) {
  forcingmonthyear(forcing_data_full, year)
}

## `summarise()` has grouped output by 'month', 'year'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'month', 'year'. You can override using the `.groups` argument.

## `summarise()` has grouped output by 'month', 'year'. You can override using the `.groups` argument.

## `summarise()` has grouped output by 'month', 'year'. You can override using the `.groups` argument.

## `summarise()` has grouped output by 'month', 'year'. You can override using the `.groups` argument.

## `summarise()` has grouped output by 'month', 'year'. You can override using the `.groups` argument.

3.9 Average daily precipitation

For this graph, I grouped the data by day by site to get the daily temperature.

# making an object for daily temperature
dailytemp<-forcing_data_full %>%
    group_by(day,site) %>%
    summarize(daily_air_temp = mean(air.temp..K.,na.rm=T))

## `summarise()` has grouped output by 'day'. You can override using the `.groups` argument.

#plotting bonus
ggplot(dailytemp, aes(x=day, y=daily_air_temp, color=site))+
  geom_line()+
  labs(x='Day of the Month', y='Daily Air Temperature', title = "Daily Temperature")