R Markdown

library(tidyverse)
library(knitr)
library(broom)
library(ggthemes)
temp <- read_csv('DATA_605_Assignments/Data605Week11/GlobalTemperatures.csv')
co2 <- read_csv('DATA_605_Assignments/Data605Week11/co2.csv')

temp %>% head() %>% kable()
dt LandAverageTemperature LandAverageTemperatureUncertainty LandMaxTemperature LandMaxTemperatureUncertainty LandMinTemperature LandMinTemperatureUncertainty LandAndOceanAverageTemperature LandAndOceanAverageTemperatureUncertainty
1750-01-01 3.034 3.574 NA NA NA NA NA NA
1750-02-01 3.083 3.702 NA NA NA NA NA NA
1750-03-01 5.626 3.076 NA NA NA NA NA NA
1750-04-01 8.490 2.451 NA NA NA NA NA NA
1750-05-01 11.573 2.072 NA NA NA NA NA NA
1750-06-01 12.937 1.724 NA NA NA NA NA NA
co2 %>% head() %>% kable()
Year Month Decimal Date Carbon Dioxide (ppm) Seasonally Adjusted CO2 (ppm) Carbon Dioxide Fit (ppm) Seasonally Adjusted CO2 Fit (ppm)
1958 1 1958.041 NA NA NA NA
1958 2 1958.126 NA NA NA NA
1958 3 1958.203 315.69 314.42 316.18 314.89
1958 4 1958.288 317.45 315.15 317.30 314.98
1958 5 1958.370 317.50 314.73 317.83 315.06
1958 6 1958.455 NA NA 317.22 315.14

Wrangling the Data

tempDF <- temp %>% 
  select(dt, LandAverageTemperature) %>% 
  rename(Date = dt)
co2DF <- co2 %>% 
  select(Year, Month, `Carbon Dioxide (ppm)`)

co2DF$day <- 1
co2DF$Date <- as.Date(with(co2DF, paste(Year, Month, day,sep="-")), "%Y-%m-%d")
co2DF <- co2DF %>% 
  select(Date, `Carbon Dioxide (ppm)`) %>% 
  rename(CO2 =`Carbon Dioxide (ppm)`)

Merging Data

df <- inner_join(tempDF, co2DF)
## Joining, by = "Date"
df$Month <- format.Date(df$Date, "%m") %>% as.numeric()
df <- df %>% 
  mutate(Month = month.abb[Month]) %>% 
  na.omit()
ggplot(df, aes(CO2, LandAverageTemperature, color=Month)) +
  geom_point()+
  theme_tufte()+
  ggtitle('CO2 Levels and Temperature by Month')

There is unsurprisingly a very high monthly component to the temperature. This will require dummy variables to control for this and make our model more accurate.

mod <- lm(LandAverageTemperature~ CO2 + Month -1, df) # -1 to remove intercept and add all months
modSum <- summary(mod)
modSum
## 
## Call:
## lm(formula = LandAverageTemperature ~ CO2 + Month - 1, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.00165 -0.19220  0.01397  0.19264  1.00390 
## 
## Coefficients:
##            Estimate Std. Error t value Pr(>|t|)    
## CO2       0.0141950  0.0004695  30.231  < 2e-16 ***
## MonthApr  3.8388965  0.1711851  22.425  < 2e-16 ***
## MonthAug  9.2053816  0.1692868  54.377  < 2e-16 ***
## MonthDec -0.8496190  0.1697290  -5.006 7.11e-07 ***
## MonthFeb -1.3077137  0.1705851  -7.666 6.19e-14 ***
## MonthJan -1.8556168  0.1698909 -10.922  < 2e-16 ***
## MonthJul  9.6480623  0.1701942  56.689  < 2e-16 ***
## MonthJun  8.7777224  0.1712296  51.263  < 2e-16 ***
## MonthMar  0.7846249  0.1705761   4.600 5.05e-06 ***
## MonthMay  6.6383841  0.1711184  38.794  < 2e-16 ***
## MonthNov  1.4932754  0.1691369   8.829  < 2e-16 ***
## MonthOct  4.7676271  0.1688887  28.229  < 2e-16 ***
## MonthSep  7.4315996  0.1685262  44.098  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3121 on 676 degrees of freedom
## Multiple R-squared:  0.999,  Adjusted R-squared:  0.999 
## F-statistic: 5.392e+04 on 13 and 676 DF,  p-value: < 2.2e-16
modDF <- augment(mod) # https://stackoverflow.com/questions/36731027/how-can-i-plot-the-residuals-of-lm-with-ggplot
ggplot(modDF, aes(.fitted, .resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  theme_tufte()

df2 = cbind(df, pred = predict(mod))
df2$Month = factor(df2$Month, levels = month.abb)
ggplot(df2, aes(CO2, LandAverageTemperature, color=Month)) +
  geom_point()+
  geom_line(mapping=aes(y = pred))+
  theme_tufte()+
  ggtitle('CO2 Levels and Temperature by Month')

Given the residual plot and the scatter plot with the regression lines for each dummy overlaid, it appears that the model is a good fit. Given the significance of the coefficient: 0.014195 with a pval of: 1.157520810^{-127}