library(tidyverse)
library(knitr)
library(broom)
library(ggthemes)
temp <- read_csv('DATA_605_Assignments/Data605Week11/GlobalTemperatures.csv')
co2 <- read_csv('DATA_605_Assignments/Data605Week11/co2.csv')
temp %>% head() %>% kable()
| dt | LandAverageTemperature | LandAverageTemperatureUncertainty | LandMaxTemperature | LandMaxTemperatureUncertainty | LandMinTemperature | LandMinTemperatureUncertainty | LandAndOceanAverageTemperature | LandAndOceanAverageTemperatureUncertainty |
|---|---|---|---|---|---|---|---|---|
| 1750-01-01 | 3.034 | 3.574 | NA | NA | NA | NA | NA | NA |
| 1750-02-01 | 3.083 | 3.702 | NA | NA | NA | NA | NA | NA |
| 1750-03-01 | 5.626 | 3.076 | NA | NA | NA | NA | NA | NA |
| 1750-04-01 | 8.490 | 2.451 | NA | NA | NA | NA | NA | NA |
| 1750-05-01 | 11.573 | 2.072 | NA | NA | NA | NA | NA | NA |
| 1750-06-01 | 12.937 | 1.724 | NA | NA | NA | NA | NA | NA |
co2 %>% head() %>% kable()
| Year | Month | Decimal Date | Carbon Dioxide (ppm) | Seasonally Adjusted CO2 (ppm) | Carbon Dioxide Fit (ppm) | Seasonally Adjusted CO2 Fit (ppm) |
|---|---|---|---|---|---|---|
| 1958 | 1 | 1958.041 | NA | NA | NA | NA |
| 1958 | 2 | 1958.126 | NA | NA | NA | NA |
| 1958 | 3 | 1958.203 | 315.69 | 314.42 | 316.18 | 314.89 |
| 1958 | 4 | 1958.288 | 317.45 | 315.15 | 317.30 | 314.98 |
| 1958 | 5 | 1958.370 | 317.50 | 314.73 | 317.83 | 315.06 |
| 1958 | 6 | 1958.455 | NA | NA | 317.22 | 315.14 |
tempDF <- temp %>%
select(dt, LandAverageTemperature) %>%
rename(Date = dt)
co2DF <- co2 %>%
select(Year, Month, `Carbon Dioxide (ppm)`)
co2DF$day <- 1
co2DF$Date <- as.Date(with(co2DF, paste(Year, Month, day,sep="-")), "%Y-%m-%d")
co2DF <- co2DF %>%
select(Date, `Carbon Dioxide (ppm)`) %>%
rename(CO2 =`Carbon Dioxide (ppm)`)
Merging Data
df <- inner_join(tempDF, co2DF)
## Joining, by = "Date"
df$Month <- format.Date(df$Date, "%m") %>% as.numeric()
df <- df %>%
mutate(Month = month.abb[Month]) %>%
na.omit()
ggplot(df, aes(CO2, LandAverageTemperature, color=Month)) +
geom_point()+
theme_tufte()+
ggtitle('CO2 Levels and Temperature by Month')
There is unsurprisingly a very high monthly component to the temperature. This will require dummy variables to control for this and make our model more accurate.
mod <- lm(LandAverageTemperature~ CO2 + Month -1, df) # -1 to remove intercept and add all months
modSum <- summary(mod)
modSum
##
## Call:
## lm(formula = LandAverageTemperature ~ CO2 + Month - 1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.00165 -0.19220 0.01397 0.19264 1.00390
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## CO2 0.0141950 0.0004695 30.231 < 2e-16 ***
## MonthApr 3.8388965 0.1711851 22.425 < 2e-16 ***
## MonthAug 9.2053816 0.1692868 54.377 < 2e-16 ***
## MonthDec -0.8496190 0.1697290 -5.006 7.11e-07 ***
## MonthFeb -1.3077137 0.1705851 -7.666 6.19e-14 ***
## MonthJan -1.8556168 0.1698909 -10.922 < 2e-16 ***
## MonthJul 9.6480623 0.1701942 56.689 < 2e-16 ***
## MonthJun 8.7777224 0.1712296 51.263 < 2e-16 ***
## MonthMar 0.7846249 0.1705761 4.600 5.05e-06 ***
## MonthMay 6.6383841 0.1711184 38.794 < 2e-16 ***
## MonthNov 1.4932754 0.1691369 8.829 < 2e-16 ***
## MonthOct 4.7676271 0.1688887 28.229 < 2e-16 ***
## MonthSep 7.4315996 0.1685262 44.098 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3121 on 676 degrees of freedom
## Multiple R-squared: 0.999, Adjusted R-squared: 0.999
## F-statistic: 5.392e+04 on 13 and 676 DF, p-value: < 2.2e-16
modDF <- augment(mod) # https://stackoverflow.com/questions/36731027/how-can-i-plot-the-residuals-of-lm-with-ggplot
ggplot(modDF, aes(.fitted, .resid)) +
geom_point() +
geom_hline(yintercept = 0) +
theme_tufte()
df2 = cbind(df, pred = predict(mod))
df2$Month = factor(df2$Month, levels = month.abb)
ggplot(df2, aes(CO2, LandAverageTemperature, color=Month)) +
geom_point()+
geom_line(mapping=aes(y = pred))+
theme_tufte()+
ggtitle('CO2 Levels and Temperature by Month')
Given the residual plot and the scatter plot with the regression lines for each dummy overlaid, it appears that the model is a good fit. Given the significance of the coefficient: 0.014195 with a pval of: 1.157520810^{-127}