library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr)
Dataset Daily Summaries Order Start Date 1941-01-01 00:00 Order End Date 2022-03-14 23:59 Output Format Custom GHCN-Daily CSV Data Types PRCP, TMAX, TMIN Custom Flag(s) Station Name Units Standard Stations/Locations SAN DIEGO INTERNATIONAL AIRPORT, CA US (Station ID: GHCND:USW00023188)
San_Diego_Airport <- read_csv("SanDiego-2908058.csv", col_types=cols(DATE = col_character()))
glimpse(San_Diego_Airport)
## Rows: 29,655
## Columns: 6
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188", "USW00023188", "U~
## $ NAME <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE <chr> "1941-01-01", "1941-01-02", "1941-01-03", "1941-01-04", "1941-~
## $ PRCP <dbl> 0.00, 0.00, 0.00, 0.03, 0.00, 0.00, 0.00, 0.00, 0.00, 0.73, 0.~
## $ TMAX <dbl> 68, 67, 65, 64, 62, 65, 70, 71, 68, 67, 65, 67, 63, 63, 65, 65~
## $ TMIN <dbl> 49, 47, 45, 46, 51, 52, 49, 48, 56, 56, 53, 48, 48, 50, 47, 49~
summary(San_Diego_Airport)
## STATION NAME DATE PRCP
## Length:29655 Length:29655 Length:29655 Min. :0.00000
## Class :character Class :character Class :character 1st Qu.:0.00000
## Mode :character Mode :character Mode :character Median :0.00000
## Mean :0.02684
## 3rd Qu.:0.00000
## Max. :2.70000
## NA's :1
## TMAX TMIN
## Min. : 46.00 Min. :29.00
## 1st Qu.: 66.00 1st Qu.:52.00
## Median : 70.00 Median :58.00
## Mean : 70.69 Mean :57.25
## 3rd Qu.: 75.00 3rd Qu.:63.00
## Max. :111.00 Max. :78.00
## NA's :2 NA's :2
The character column DATE contains dates in ISO-8601 format. Use as.date() to convert it and run glimpse again.
San_Diego_Airport$DATE = as.Date(San_Diego_Airport$DATE)
glimpse(San_Diego_Airport)
## Rows: 29,655
## Columns: 6
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188", "USW00023188", "U~
## $ NAME <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE <date> 1941-01-01, 1941-01-02, 1941-01-03, 1941-01-04, 1941-01-05, 1~
## $ PRCP <dbl> 0.00, 0.00, 0.00, 0.03, 0.00, 0.00, 0.00, 0.00, 0.00, 0.73, 0.~
## $ TMAX <dbl> 68, 67, 65, 64, 62, 65, 70, 71, 68, 67, 65, 67, 63, 63, 65, 65~
## $ TMIN <dbl> 49, 47, 45, 46, 51, 52, 49, 48, 56, 56, 53, 48, 48, 50, 47, 49~
Do a summary() and check for annomalies
summary(San_Diego_Airport)
## STATION NAME DATE PRCP
## Length:29655 Length:29655 Min. :1941-01-01 Min. :0.00000
## Class :character Class :character 1st Qu.:1961-04-22 1st Qu.:0.00000
## Mode :character Mode :character Median :1981-08-09 Median :0.00000
## Mean :1981-08-08 Mean :0.02684
## 3rd Qu.:2001-11-25 3rd Qu.:0.00000
## Max. :2022-03-14 Max. :2.70000
## NA's :1
## TMAX TMIN
## Min. : 46.00 Min. :29.00
## 1st Qu.: 66.00 1st Qu.:52.00
## Median : 70.00 Median :58.00
## Mean : 70.69 Mean :57.25
## 3rd Qu.: 75.00 3rd Qu.:63.00
## Max. :111.00 Max. :78.00
## NA's :2 NA's :2
San_Diego_Airport %>%
filter(is.na(TMAX) | is.na(TMIN) | is.na(PRCP))
## # A tibble: 3 x 6
## STATION NAME DATE PRCP TMAX TMIN
## <chr> <chr> <date> <dbl> <dbl> <dbl>
## 1 USW00023188 SAN DIEGO INTERNATIONAL AIRPORT, CA ~ 1946-02-23 0 64 NA
## 2 USW00023188 SAN DIEGO INTERNATIONAL AIRPORT, CA ~ 1946-07-17 0 NA 65
## 3 USW00023188 SAN DIEGO INTERNATIONAL AIRPORT, CA ~ 2022-03-14 NA NA NA
San_Diego_Airport = San_Diego_Airport%>%
drop_na()
summary(San_Diego_Airport)
## STATION NAME DATE PRCP
## Length:29652 Length:29652 Min. :1941-01-01 Min. :0.00000
## Class :character Class :character 1st Qu.:1961-04-23 1st Qu.:0.00000
## Mode :character Mode :character Median :1981-08-09 Median :0.00000
## Mean :1981-08-09 Mean :0.02684
## 3rd Qu.:2001-11-25 3rd Qu.:0.00000
## Max. :2022-03-13 Max. :2.70000
## TMAX TMIN
## Min. : 46.00 Min. :29.00
## 1st Qu.: 66.00 1st Qu.:52.00
## Median : 70.00 Median :58.00
## Mean : 70.69 Mean :57.25
## 3rd Qu.: 75.00 3rd Qu.:63.00
## Max. :111.00 Max. :78.00
Get density with rug plots for TMAZ, TMIN, and PRCP
# TMAX density plot
San_Diego_Airport %>%
ggplot(aes(x = TMAX)) +
geom_density() +
geom_rug() +
ggtitle("TMAX")
moderate temperature and a shoulder with a secondary peak to the right of the primary peak.
# TMIN density plot
San_Diego_Airport %>%
ggplot(aes(x=TMIN)) +
geom_density() +
geom_rug() +
ggtitle("TMIN")
Moderate TMIN
# PRCP density plot
San_Diego_Airport %>%
ggplot(aes(x=PRCP)) +
geom_density() +
geom_rug() +
ggtitle("PRCP density plot")
We see most of the PRCP are within 0 to 2.
Load the package lubridate. Then use the functions year(), month() and day() to create the variables yr, mo, and dy. Make these variables factors. Use glimpse() and summary() to verify the results.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
#add dplyr library aslo to work with columns
library(dplyr)
San_Diego_Airport = San_Diego_Airport %>%
mutate(year = factor(year(DATE)),
month = factor(month(DATE)),
day = factor(day(DATE)))
glimpse(San_Diego_Airport)
## Rows: 29,652
## Columns: 9
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188", "USW00023188", "U~
## $ NAME <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE <date> 1941-01-01, 1941-01-02, 1941-01-03, 1941-01-04, 1941-01-05, 1~
## $ PRCP <dbl> 0.00, 0.00, 0.00, 0.03, 0.00, 0.00, 0.00, 0.00, 0.00, 0.73, 0.~
## $ TMAX <dbl> 68, 67, 65, 64, 62, 65, 70, 71, 68, 67, 65, 67, 63, 63, 65, 65~
## $ TMIN <dbl> 49, 47, 45, 46, 51, 52, 49, 48, 56, 56, 53, 48, 48, 50, 47, 49~
## $ year <fct> 1941, 1941, 1941, 1941, 1941, 1941, 1941, 1941, 1941, 1941, 19~
## $ month <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ day <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,~
# check summary
summary(San_Diego_Airport)
## STATION NAME DATE PRCP
## Length:29652 Length:29652 Min. :1941-01-01 Min. :0.00000
## Class :character Class :character 1st Qu.:1961-04-23 1st Qu.:0.00000
## Mode :character Mode :character Median :1981-08-09 Median :0.00000
## Mean :1981-08-09 Mean :0.02684
## 3rd Qu.:2001-11-25 3rd Qu.:0.00000
## Max. :2022-03-13 Max. :2.70000
##
## TMAX TMIN year month
## Min. : 46.00 Min. :29.00 1944 : 366 1 : 2542
## 1st Qu.: 66.00 1st Qu.:52.00 1948 : 366 3 : 2524
## Median : 70.00 Median :58.00 1952 : 366 5 : 2511
## Mean : 70.69 Mean :57.25 1956 : 366 8 : 2511
## 3rd Qu.: 75.00 3rd Qu.:63.00 1960 : 366 10 : 2511
## Max. :111.00 Max. :78.00 1964 : 366 12 : 2511
## (Other):27456 (Other):14542
## day
## 1 : 975
## 2 : 975
## 3 : 975
## 4 : 975
## 5 : 975
## 9 : 975
## (Other):23802
# TMAX by month
San_Diego_Airport %>%
ggplot(aes(month, TMAX)) +
geom_point() +
ggtitle("TMAX by month")
# TMIN by Month. #size is used here
San_Diego_Airport %>%
ggplot(aes(month, TMIN)) +
geom_point(size=0.5) +
geom_smooth(color ="red") +
ggtitle("TMIN by month")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# PRCP by month. #kept alpha for transperance here
San_Diego_Airport %>%
ggplot(aes(month, PRCP)) +
geom_point(alpha = 0.3) +
ggtitle("PRCP by month")
September and October are two of the hottest months in San Diego.
The minimum temperatures are on December and January. The lowest is on January around 30 degree which is still not that cold as compared to other Northern US locations.
Seems like the precipitation is high in december and january reaching around the value of 2.7.
# Point graph of TMAX and TMIN. geom_jitter() is used here
San_Diego_Airport %>%
ggplot(aes(TMAX, TMIN)) +
geom_jitter(size =0.1) +
geom_smooth(color = "red") +
ggtitle("TMAX and TMIN")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Point graph of TMAX and PRCP
San_Diego_Airport %>%
ggplot(aes(TMAX, PRCP)) +
geom_point() +
geom_smooth(color="red")+
ggtitle("TMAX and PRCP")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Point graph of TMIN and PRCP
San_Diego_Airport %>%
ggplot(aes(TMIN, PRCP)) +
geom_point() +
geom_smooth(color="red")+
ggtitle("TMAX and PRCP")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#line graph for TMAX, TMIN, and PRCP
ggplot(San_Diego_Airport, aes(DATE, TMAX)) +
geom_line(size=0.1) +
ggtitle("DATE Vs TMAX line graph")
ggplot(San_Diego_Airport, aes(DATE, TMIN)) +
geom_line(size=0.1, linetype = "dotted") +
geom_point(size = 0.2) +
ggtitle("DATE Vs TMIN line graph with dotted points")
ggplot(San_Diego_Airport, aes(DATE, PRCP)) +
geom_line(size=0.1, linetype = "dotted") +
geom_point(size = 0.2) +
ggtitle("DATE Vs PRCP line graph with dotted points")
#default histogram binwidth
ggplot(San_Diego_Airport, aes(x = TMAX)) +
geom_histogram() +
ggtitle("TMAX histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#using bin width = 15 and see the difference.
ggplot(San_Diego_Airport, aes(x = TMAX)) +
geom_histogram(binS = 15) +
ggtitle("TAX max histogram with binwidth 15")
## Warning: Ignoring unknown parameters: binS
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(San_Diego_Airport, aes(month, TMIN)) +
geom_boxplot() +
facet_wrap(~month) +
ggtitle("TMIN box plot for each month")
## Barplot of PRCP
ggplot(San_Diego_Airport, aes(x = TMAX)) +
geom_bar() +
ggtitle("Bar plot for TMAX")
# it is done by looking at a time-series of z-scores instead of raw data. If the recent recent numbers are below -3, something was going on.
# histogram of the raw data is done.
rain_5_8 = San_Diego_Airport %>%
select(DATE, PRCP) %>%
mutate(mo = month(DATE),
yr = year(DATE)) %>%
filter(mo >= 5 & mo <= 8) %>%
group_by(yr) %>%
summarize(rain = sum(PRCP)) %>%
ungroup() %>%
mutate(mrain = mean(rain),
sdrain = sd(rain),
z_score = (rain - mrain)/sdrain)
head(rain_5_8)
## # A tibble: 6 x 5
## yr rain mrain sdrain z_score
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1941 0.09 0.372 0.672 -0.419
## 2 1942 0.12 0.372 0.672 -0.375
## 3 1943 0.03 0.372 0.672 -0.509
## 4 1944 0.32 0.372 0.672 -0.0770
## 5 1945 1.06 0.372 0.672 1.02
## 6 1946 0.01 0.372 0.672 -0.538
rain_5_8 %>%
ggplot(aes(x = yr, y = z_score)) +
geom_point() +
geom_line(size = .1)
The lowest z_socre is around -0.5 for PRCP. so there is no indication of drought.
The histogram is as follows
rain_5_8 %>%
ggplot(aes(x = rain)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
perfect day is day with maximum temperature of 75 with no rain.
# code to get the count of perfect days in a year.
San_Diego_Airport %>%
filter(TMAX == 75 & PRCP == 0) %>%
group_by(year) %>%
summarize(count = n()) %>%
filter(count > 0) %>%
ungroup() %>%
ggplot(aes(x = factor(count))) +
geom_bar()
The most common year count is 18. there have been years with only two.
San_Diego_Airport %>%
filter(TMAX == 75 & PRCP == 0) %>%
ggplot(aes(x = month)) +
geom_bar() +
ggtitle("bar graph to see Which month have perfect days")
July seems to have more perfect days. August comes the second.
To summarize the annual pattern, we need to put all of the data into a single year. we could think of this as making a prediction for an arbitrary future year. We need to do this so we can have a date variable that doesn’t contain different years. We’ll use 2024 since it’s a leap year. We’ll call this new dataframe cal24. we will use make_date() from the lubridate package and summary() on cal24 to verify that you succeeded.
cal24 = San_Diego_Airport %>%
mutate(DATE = make_date(2024, month, day),
year = year(DATE))
summary(cal24)
## STATION NAME DATE PRCP
## Length:29652 Length:29652 Min. :2024-01-01 Min. :0.00000
## Class :character Class :character 1st Qu.:2024-04-01 1st Qu.:0.00000
## Mode :character Mode :character Median :2024-07-01 Median :0.00000
## Mean :2024-07-01 Mean :0.02684
## 3rd Qu.:2024-10-01 3rd Qu.:0.00000
## Max. :2024-12-31 Max. :2.70000
##
## TMAX TMIN year month
## Min. : 46.00 Min. :29.00 Min. :2024 1 : 2542
## 1st Qu.: 66.00 1st Qu.:52.00 1st Qu.:2024 3 : 2524
## Median : 70.00 Median :58.00 Median :2024 5 : 2511
## Mean : 70.69 Mean :57.25 Mean :2024 8 : 2511
## 3rd Qu.: 75.00 3rd Qu.:63.00 3rd Qu.:2024 10 : 2511
## Max. :111.00 Max. :78.00 Max. :2024 12 : 2511
## (Other):14542
## day
## 1 : 975
## 2 : 975
## 3 : 975
## 4 : 975
## 5 : 975
## 9 : 975
## (Other):23802
size and aplpha are 0.2 to deal with the overplotting in geom_point()
cal24 %>%
ggplot(aes(x = DATE, y = TMAX)) +
geom_point(aplpha = 0.2, size = 0.2) +
geom_smooth(color = "red")
## Warning: Ignoring unknown parameters: aplpha
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
it seems to take longer going up in TMAX for San Diago.
Add a geom_point() with red points to show calendar 2021. Create a dataframe cal21 by filtering for yr = 2021. Use make_date() to change the yr to 2024. Add a second geom_point() with this dataframe as the data argument. Set the size parameter to .5 in the second geom_point().
cal21 = San_Diego_Airport %>%
filter(year == 2021) %>%
mutate(DATE = make_date(2024, month, day))
cal24 %>%
ggplot(aes(x=DATE, y = TMAX)) +
geom_point(size = 0.2, alpha = 0.2) +
geom_point(data = cal21, aes(x = DATE, y = TMAX),
color = "red" , size = 0.5)
#scatter plot for mean of TMAX for each DATE for predicting in 2024
cal24 %>%
group_by(DATE) %>%
summarize(MTMAX = mean(TMAX)) %>%
ungroup() %>%
ggplot(aes(x =DATE, y = MTMAX))+
geom_point(size = 0.5) +
ggtitle("predicting mean of TMAX for each DATE value for predicting in 2024")
#scatter plot for mean of TMIN for each DATE for predicting in 2024
cal24 %>%
group_by(DATE) %>%
summarize(MTMIN = mean(TMIN)) %>%
ungroup() %>%
ggplot(aes(x =DATE, y = MTMIN))+
geom_point(size = 0.5) +
ggtitle("predicting mean of TMIN for each DATE value for predicting in 2024")
#density plot of mean of TMAX for predicting 2024
cal24 %>%
group_by(DATE)%>%
summarize(MTMAX = mean(TMAX)) %>%
ungroup() %>%
ggplot(aes(x = MTMAX))+
geom_density(adjust = 0.2) +
ggtitle("density plot of mean of TMAX for predicting in 2024")
#density plot of mean of TMAX for predicting in 2024
cal24 %>%
group_by(DATE)%>%
summarize(MTMIN = mean(TMIN)) %>%
ungroup() %>%
ggplot(aes(x = MTMIN))+
geom_density(adjust = 0.2) +
ggtitle("density plot of mean of TMIN for predicting in 2024")
standard deviation of TMAX, TMIN, and PRCP based on cal24 for 2024
#standard deviation of TMAX for 2024
cal24 %>%
group_by(DATE) %>%
summarize(sd_TMAX = sd(TMAX)) %>%
ungroup() %>%
ggplot(aes(DATE, sd_TMAX)) +
geom_point() +
geom_smooth(color ="red") +
ggtitle("standard deviation of TMAX in 2024")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#standard deviation of TMIN for 2024
cal24 %>%
group_by(DATE) %>%
summarize(sd_TMIN = sd(TMIN)) %>%
ungroup() %>%
ggplot(aes(DATE, sd_TMIN)) +
geom_point() +
ggtitle("standard deviation of TMIN in 2024")
#standard deviation of PRCP for 2024
cal24 %>%
group_by(DATE) %>%
summarize(sd_PRCP = sd(PRCP)) %>%
ungroup() %>%
ggplot(aes(DATE, sd_PRCP)) +
geom_point() +
ggtitle("standard deviation of PRCP in 2024")
cal24 %>%
mutate(diff = TMAX - TMIN) %>%
group_by(DATE) %>%
summarize(diff = mean(diff)) %>%
ungroup() %>%
ggplot(aes(x = DATE, y = diff)) +
geom_point() +
ggtitle("predicting Difference between TMAX and TMIN for 2024")
The difference seems to be higher on the december and january.
There are two possible ways to look at precipitation. We could use either the mean value of precipitation for a date, or the probability of precipitation on that date.
#we will use mean value of precipitaiton first. we'll also use plotly for interactive plot for Rpub and HTML
#import library plotly
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#use plotly for the mean PRCP
g1 = cal24 %>%
group_by(DATE) %>%
summarize(mean_precip = mean(PRCP)) %>%
ungroup() %>%
ggplot(aes(x = DATE, y = mean_precip)) +
geom_point() +
ggtitle("predicting mean value of precipitation for 2024")
ggplotly(g1) # For Rpubs or other html
# Now do the probability of precipitation for 2024
g2 <- cal24 %>%
group_by(DATE) %>%
summarize(prob_precip = mean(PRCP > 0)) %>%
ungroup() %>%
ggplot(aes(x = DATE, y = prob_precip)) +
geom_point() +
ggtitle("predicting probability of PRCP in 2024")
ggplotly(g2)
Both of these graphs look similar with slight variations.
A graph showing loss curves for precipitation and TMAX. Since these two variables have such different values, ywe will have to create z-scores to make them visually compatible. Call the z-score variables n_TMAX and n_PRCP.
cal24 %>%
mutate(n_TMAX = (TMAX - mean(TMAX))/sd(TMAX),
n_PRCP = (PRCP - mean(PRCP))/sd(PRCP)) %>%
ggplot(aes(x = DATE)) +
geom_smooth(aes(y = n_TMAX), color = "red") +
geom_smooth(aes(y = n_PRCP), color = "blue") +
ggtitle("predicting loss curve for PRCP and TMAX for 2024")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
The turning points in the summer are essentially the same.