Data wrangling

census<-read.csv("Energy Census and Economic Data US 2010-2014.csv")

#install.packages("tidyverse")
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
attach(census)

group_by(census,State)
## # A tibble: 52 x 192
## # Groups:   State [52]
##    StateCodes State Region Division Coast Great.Lakes TotalC2010 TotalC2011
##    <fct>      <fct>  <int>    <int> <int>       <int>      <int>      <int>
##  1 AL         Alab~      3        6     1           0    1931522    1905207
##  2 AK         Alas~      4        9     1           0     653221     653637
##  3 AZ         Ariz~      4        8     0           0    1383531    1424944
##  4 AR         Arka~      3        7     0           0    1120632    1122544
##  5 CA         Cali~      4        9     1           0    7760629    7777115
##  6 CO         Colo~      4        8     0           0    1513547    1470445
##  7 CT         Conn~      1        1     1           0     764970     739130
##  8 DE         Dela~      3        5     1           0     250212     272568
##  9 FL         Flor~      3        5     1           0    4282673    4141711
## 10 GA         Geor~      3        5     1           0    3100144    2982837
## # ... with 42 more rows, and 184 more variables: TotalC2012 <int>,
## #   TotalC2013 <int>, TotalC2014 <int>, TotalP2010 <int>,
## #   TotalP2011 <int>, TotalP2012 <int>, TotalP2013 <int>,
## #   TotalP2014 <int>, TotalE2010 <dbl>, TotalE2011 <dbl>,
## #   TotalE2012 <dbl>, TotalE2013 <dbl>, TotalE2014 <dbl>,
## #   TotalPrice2010 <dbl>, TotalPrice2011 <dbl>, TotalPrice2012 <dbl>,
## #   TotalPrice2013 <dbl>, TotalPrice2014 <dbl>, TotalC10.11 <dbl>,
## #   TotalC11.12 <dbl>, TotalC12.13 <dbl>, TotalC13.14 <dbl>,
## #   TotalP10.11 <dbl>, TotalP11.12 <dbl>, TotalP12.13 <dbl>,
## #   TotalP13.14 <dbl>, TotalE10.11 <dbl>, TotalE11.12 <dbl>,
## #   TotalE12.13 <dbl>, TotalE13.14 <dbl>, TotalPrice10.11 <dbl>,
## #   TotalPrice11.12 <dbl>, TotalPrice12.13 <dbl>, TotalPrice13.14 <dbl>,
## #   BiomassC2010 <int>, BiomassC2011 <int>, BiomassC2012 <int>,
## #   BiomassC2013 <int>, BiomassC2014 <int>, CoalC2010 <int>,
## #   CoalC2011 <int>, CoalC2012 <int>, CoalC2013 <int>, CoalC2014 <int>,
## #   CoalP2010 <int>, CoalP2011 <int>, CoalP2012 <int>, CoalP2013 <int>,
## #   CoalP2014 <int>, CoalE2010 <dbl>, CoalE2011 <dbl>, CoalE2012 <dbl>,
## #   CoalE2013 <dbl>, CoalE2014 <dbl>, CoalPrice2010 <dbl>,
## #   CoalPrice2011 <dbl>, CoalPrice2012 <dbl>, CoalPrice2013 <dbl>,
## #   CoalPrice2014 <dbl>, ElecC2010 <int>, ElecC2011 <int>,
## #   ElecC2012 <int>, ElecC2013 <int>, ElecC2014 <int>, ElecE2010 <dbl>,
## #   ElecE2011 <dbl>, ElecE2012 <dbl>, ElecE2013 <dbl>, ElecE2014 <dbl>,
## #   ElecPrice2010 <dbl>, ElecPrice2011 <dbl>, ElecPrice2012 <dbl>,
## #   ElecPrice2013 <dbl>, ElecPrice2014 <dbl>, FossFuelC2010 <int>,
## #   FossFuelC2011 <int>, FossFuelC2012 <int>, FossFuelC2013 <int>,
## #   FossFuelC2014 <int>, GeoC2010 <int>, GeoC2011 <int>, GeoC2012 <int>,
## #   GeoC2013 <int>, GeoC2014 <int>, GeoP2010 <int>, GeoP2011 <int>,
## #   GeoP2012 <int>, GeoP2013 <int>, GeoP2014 <int>, HydroC2010 <int>,
## #   HydroC2011 <int>, HydroC2012 <int>, HydroC2013 <int>,
## #   HydroC2014 <int>, HydroP2010 <int>, HydroP2011 <int>,
## #   HydroP2012 <int>, HydroP2013 <int>, HydroP2014 <int>,
## #   NatGasC2010 <int>, ...
censusEditNL<-data.frame(State,
                         TotalC=c(TotalC2010,TotalC2011,TotalC2012,TotalC2013,TotalC2014),
                         TotalE=c(TotalE2010,TotalE2011,TotalE2012,TotalE2013,TotalE2014),
                         TotalP=c(TotalP2010,TotalP2011,TotalP2012,TotalP2013,TotalP2014),
                         TotalPrice=c(TotalPrice2010,TotalPrice2011,TotalPrice2012,TotalPrice2013,TotalPrice2014),
                         GDP=c(GDP2010,GDP2011,GDP2012,GDP2013,GDP2014),
                         FossFuelC=c(FossFuelC2010,FossFuelC2011,FossFuelC2012,FossFuelC2013,FossFuelC2014),
                         BiomassC=c(BiomassC2010,BiomassC2011,BiomassC2012,BiomassC2013,BiomassC2014),
                         CoalC=c(CoalC2010,CoalC2011,CoalC2012,CoalC2013,CoalC2014),
                         CoalE=c(CoalE2010,CoalE2011,CoalE2012,CoalE2013,CoalE2014),
                         CoalP=c(CoalP2010,CoalP2011,CoalP2012,CoalP2013,CoalP2014),
                         CoalPrice=c(CoalPrice2010,CoalPrice2011,CoalPrice2012,CoalPrice2013,CoalPrice2014),
                         ElecC=c(ElecC2010,ElecC2011,ElecC2012,ElecC2013,ElecC2014),
                         ElecE=c(ElecE2010,ElecE2011,ElecE2012,ElecE2013,ElecE2014),
                         ElecPrice=c(ElecPrice2010,ElecPrice2011,ElecPrice2012,ElecPrice2013,ElecPrice2014),
                         GeoC=c(GeoC2010,GeoC2011,GeoC2012,GeoC2013,GeoC2014),
                         GeoP=c(GeoP2010,GeoP2011,GeoP2012,GeoP2013,GeoP2014),
                         HydroC=c(HydroC2010,HydroC2011,HydroC2012,HydroC2013,HydroC2014),
                         HydroP=c(HydroP2010,HydroP2011,HydroP2012,HydroP2013,HydroP2014),
                         NatGasC=c(NatGasC2010,NatGasC2011,NatGasC2012,NatGasC2013,NatGasC2014),
                         NatGasE=c(NatGasE2010,NatGasE2011,NatGasE2012,NatGasE2013,NatGasE2014),
                         NatGasPrice=c(NatGasPrice2010,NatGasPrice2011,NatGasPrice2012,NatGasPrice2013,NatGasPrice2014),
                         LPGC=c(LPGC2010,LPGC2011,LPGC2012,LPGC2013,LPGC2014),
                         LPGE=c(LPGE2010,LPGE2011,LPGE2012,LPGE2013,LPGE2014),
                         LPGPrice=c(LPGPrice2010,LPGPrice2011,LPGPrice2012,LPGPrice2013,LPGPrice2014),
                         POPEST=c(POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014))

view(censusEditNL)

# write.csv(censusEditNL, file = "censusEditNL.csv")

censusEdit<-read.csv("censusEditNL.csv")
detach(census)
attach(censusEdit)

Bootstrapping data

hist(GDP) #Data is heavily skewed, which seems to be the case for most of the data. Two more examples are provided (TotalC and TotaLE).

hist(TotalC)

hist(TotalE)

bootStrapCI1<-function(data, nsim){
  n<-length(data)
  bootCI<-c()
  
  for(i in 1:nsim){
    bootSamp<-sample(1:n, n, replace=TRUE)
    thisXbar<-mean(data[bootSamp])
    bootCI<-c(bootCI, thisXbar)
  }
  return(bootCI)
}

#Comparing original data CI with boostrapped data CI
t.test(GDP)
## 
##  One Sample t-test
## 
## data:  GDP
## t = 4.521, df = 259, p-value = 9.372e-06
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  347733.8 884403.9
## sample estimates:
## mean of x 
##  616068.8
GDP_Boot<-bootStrapCI1(GDP, nsim=1000)
hist(GDP_Boot)

t.test(GDP_Boot)
## 
##  One Sample t-test
## 
## data:  GDP_Boot
## t = 143.63, df = 999, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  603728.1 620453.3
## sample estimates:
## mean of x 
##  612090.7
t.test(TotalC)
## 
##  One Sample t-test
## 
## data:  TotalC
## t = 4.5429, df = 259, p-value = 8.513e-06
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  2110262 5339377
## sample estimates:
## mean of x 
##   3724819
TotalC_Boot<-bootStrapCI1(TotalC, nsim=1000)
hist(TotalC_Boot)

t.test(TotalC_Boot)
## 
##  One Sample t-test
## 
## data:  TotalC_Boot
## t = 144.26, df = 999, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  3646820 3747401
## sample estimates:
## mean of x 
##   3697111
t.test(TotalP)
## 
##  One Sample t-test
## 
## data:  TotalP
## t = 4.3397, df = 259, p-value = 2.047e-05
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  1632169 4343810
## sample estimates:
## mean of x 
##   2987990
TotalP_Boot<-bootStrapCI1(TotalP, nsim=1000)
hist(TotalP_Boot)

t.test(TotalP_Boot)
## 
##  One Sample t-test
## 
## data:  TotalP_Boot
## t = 144.98, df = 999, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  2960496 3041739
## sample estimates:
## mean of x 
##   3001117
t.test(TotalE)
## 
##  One Sample t-test
## 
## data:  TotalE
## t = 4.5355, df = 259, p-value = 8.794e-06
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  29311.94 74293.99
## sample estimates:
## mean of x 
##  51802.97
TotalE_Boot<-bootStrapCI1(TotalE, nsim=1000)
hist(TotalE_Boot)

t.test(TotalE_Boot)
## 
##  One Sample t-test
## 
## data:  TotalE_Boot
## t = 146.42, df = 999, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  51240.69 52632.86
## sample estimates:
## mean of x 
##  51936.77
Boot<-data.frame(GDP_Boot,TotalC_Boot,TotalP_Boot,TotalE_Boot)

Exploring the relationship between GDP and specific types of energy

Coal

CoalC_Boot<-bootStrapCI1(CoalC, nsim=1000)
CoalP_Boot<-bootStrapCI1(CoalP, nsim=1000)
CoalE_Boot<-bootStrapCI1(CoalE, nsim=1000)

#CoalC
attr(bs(CoalC_Boot,df=6) ,"knots")
##      25%      50%      75% 
## 602933.3 712181.3 830623.3
fit8=lm(GDP_Boot~poly(CoalC_Boot),data=Boot)

cclims=range(CoalC_Boot)
cc.grid=seq(from=cclims[1],to=cclims[2])
preds=predict(fit8,newdata=list(CoalC_Boot=cc.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)

par(mfrow=c(1,1))
fit9=lm(GDP_Boot~bs(CoalC_Boot,knots=c(600000, 700000, 820000)),data=Boot)
pred=predict(fit9,newdata=list(CoalC_Boot=cc.grid),se=T)
plot(CoalC_Boot,GDP_Boot,col="gray")
lines(cc.grid,pred$fit,lwd=2)
lines(cc.grid,pred$fit+2*pred$se,lty="dashed")
lines(cc.grid,pred$fit-2*pred$se,lty="dashed")

#CoalP
attr(bs(CoalP_Boot,df=6) ,"knots")
##      25%      50%      75% 
## 672540.0 800064.1 928645.3
fit10=lm(GDP_Boot~poly(CoalP_Boot),data=Boot)

cplims=range(CoalP_Boot)
cp.grid=seq(from=cplims[1],to=cplims[2])
preds=predict(fit10,newdata=list(CoalP_Boot=cp.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)

par(mfrow=c(1,1))
fit11=lm(GDP_Boot~bs(CoalP_Boot,knots=c(670000, 790000, 930000)),data=Boot)
pred=predict(fit11,newdata=list(CoalP_Boot=cp.grid),se=T)
plot(CoalP_Boot,GDP_Boot,col="gray")
lines(cp.grid,pred$fit,lwd=2)
lines(cp.grid,pred$fit+2*pred$se,lty="dashed")
lines(cp.grid,pred$fit-2*pred$se,lty="dashed")

#CoalE
attr(bs(CoalE_Boot,df=6) ,"knots")
##      25%      50%      75% 
## 1534.734 1771.294 2049.979
fit12=lm(GDP_Boot~poly(CoalE_Boot),data=Boot)

celims=range(CoalE_Boot)
ce.grid=seq(from=celims[1],to=celims[2])
preds=predict(fit12,newdata=list(CoalE_Boot=ce.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)

par(mfrow=c(1,1))
fit13=lm(GDP_Boot~bs(CoalE_Boot,knots=c(1500, 1800, 2000)),data=Boot)
pred=predict(fit13,newdata=list(CoalE_Boot=ce.grid),se=T)
plot(CoalE_Boot,GDP_Boot,col="gray")
lines(ce.grid,pred$fit,lwd=2)
lines(ce.grid,pred$fit+2*pred$se,lty="dashed")
lines(ce.grid,pred$fit-2*pred$se,lty="dashed")

fossil fuels

#Fossil Fuel 
FossFuelC_Boot<-bootStrapCI1(FossFuelC, nsim=1000)
Boot<-data.frame(GDP_Boot,TotalC_Boot,TotalP_Boot,TotalE_Boot,CoalC_Boot,CoalP_Boot,CoalE_Boot,FossFuelC_Boot) #Updating the Boot dataset to include the fossil fuel variable

attr(bs(FossFuelC_Boot,df=6) ,"knots")
##     25%     50%     75% 
## 2566919 2985538 3398937
fit16=lm(GDP_Boot~poly(FossFuelC_Boot),data=Boot)

ffclims=range(FossFuelC_Boot)
ffc.grid=seq(from=ffclims[1],to=ffclims[2])
preds=predict(fit16,newdata=list(FossFuelC_Boot=ffc.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)

par(mfrow=c(1,1))
fit17=lm(GDP_Boot~bs(FossFuelC_Boot,knots=c(2500000, 3000000, 3500000)),data=Boot)
pred=predict(fit17,newdata=list(FossFuelC_Boot=ffc.grid),se=T)
plot(FossFuelC_Boot,GDP_Boot,col="gray")
lines(ffc.grid,pred$fit,lwd=2)
lines(ffc.grid,pred$fit+2*pred$se,lty="dashed")
lines(ffc.grid,pred$fit-2*pred$se,lty="dashed")

electricity

ElecC_Boot<-bootStrapCI1(ElecC, nsim=1000)
ElecE_Boot<-bootStrapCI1(ElecE, nsim=1000)

#ElecC
attr(bs(ElecC_Boot,df=6) ,"knots")
##      25%      50%      75% 
## 412357.9 484980.5 552505.8
fit18=lm(GDP_Boot~poly(ElecC_Boot),data=Boot)

eclims=range(ElecC_Boot)
ec.grid=seq(from=eclims[1],to=eclims[2])
preds=predict(fit18,newdata=list(ElecC_Boot=ec.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)

par(mfrow=c(1,1))
fit19=lm(GDP_Boot~bs(ElecC_Boot,knots=c(420000, 490000, 580000 )),data=Boot)
pred=predict(fit19,newdata=list(ElecC_Boot=ec.grid),se=T)
plot(ElecC_Boot,GDP_Boot,col="gray")
lines(ec.grid,pred$fit,lwd=2)
lines(ec.grid,pred$fit+2*pred$se,lty="dashed")
lines(ec.grid,pred$fit-2*pred$se,lty="dashed")

#ElecE
attr(bs(ElecE_Boot,df=6) ,"knots")
##      25%      50%      75% 
## 12254.01 14077.60 16201.15
fit20=lm(GDP_Boot~poly(ElecE_Boot),data=Boot)

eelims=range(ElecE_Boot)
ee.grid=seq(from=eelims[1],to=eelims[2])
preds=predict(fit20,newdata=list(ElecE_Boot=ee.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)

par(mfrow=c(1,1))
fit21=lm(GDP_Boot~bs(ElecE_Boot,knots=c(12000, 14000, 16000 )),data=Boot)
pred=predict(fit21,newdata=list(ElecE_Boot=ee.grid),se=T)
plot(ElecE_Boot,GDP_Boot,col="gray")
lines(ee.grid,pred$fit,lwd=2)
lines(ee.grid,pred$fit+2*pred$se,lty="dashed")
lines(ee.grid,pred$fit-2*pred$se,lty="dashed")

hydraulic power

HydroC_Boot<-bootStrapCI1(HydroC, nsim=1000)
HydroP_Boot<-bootStrapCI1(HydroP, nsim=1000)

#Hydro C
attr(bs(HydroC_Boot,df=6) ,"knots")
##       25%       50%       75% 
##  85507.97  99421.10 116514.12
fit22=lm(GDP_Boot~poly(HydroC_Boot),data=Boot)

hclims=range(HydroC_Boot)
hc.grid=seq(from=hclims[1],to=hclims[2])
preds=predict(fit22,newdata=list(HydroC_Boot=hc.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)

par(mfrow=c(1,1))
fit23=lm(GDP_Boot~bs(HydroC_Boot,knots=c(86000, 101000, 117000)),data=Boot)
pred=predict(fit23,newdata=list(HydroC_Boot=hc.grid),se=T)
plot(HydroC_Boot,GDP_Boot,col="gray")
lines(hc.grid,pred$fit,lwd=2)
lines(hc.grid,pred$fit+2*pred$se,lty="dashed")
lines(hc.grid,pred$fit-2*pred$se,lty="dashed")

# HydroP
attr(bs(HydroP_Boot,df=6) ,"knots")
##       25%       50%       75% 
##  8818.683 10405.898 12204.325
fit24=lm(GDP_Boot~poly(HydroP_Boot),data=Boot)

hplims=range(HydroP_Boot)
hp.grid=seq(from=hplims[1],to=hplims[2])
preds=predict(fit22,newdata=list(HydroP_Boot=hp.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)

par(mfrow=c(1,1))
fit25=lm(GDP_Boot~bs(HydroP_Boot,knots=c(9000, 11000, 12000)),data=Boot)
pred=predict(fit25,newdata=list(HydroP_Boot=hp.grid),se=T)
plot(HydroP_Boot,GDP_Boot,col="gray")
lines(hp.grid,pred$fit,lwd=2)
lines(hp.grid,pred$fit+2*pred$se,lty="dashed")
lines(hp.grid,pred$fit-2*pred$se,lty="dashed")