census<-read.csv("Energy Census and Economic Data US 2010-2014.csv")
#install.packages("tidyverse")
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
attach(census)
group_by(census,State)
## # A tibble: 52 x 192
## # Groups: State [52]
## StateCodes State Region Division Coast Great.Lakes TotalC2010 TotalC2011
## <fct> <fct> <int> <int> <int> <int> <int> <int>
## 1 AL Alab~ 3 6 1 0 1931522 1905207
## 2 AK Alas~ 4 9 1 0 653221 653637
## 3 AZ Ariz~ 4 8 0 0 1383531 1424944
## 4 AR Arka~ 3 7 0 0 1120632 1122544
## 5 CA Cali~ 4 9 1 0 7760629 7777115
## 6 CO Colo~ 4 8 0 0 1513547 1470445
## 7 CT Conn~ 1 1 1 0 764970 739130
## 8 DE Dela~ 3 5 1 0 250212 272568
## 9 FL Flor~ 3 5 1 0 4282673 4141711
## 10 GA Geor~ 3 5 1 0 3100144 2982837
## # ... with 42 more rows, and 184 more variables: TotalC2012 <int>,
## # TotalC2013 <int>, TotalC2014 <int>, TotalP2010 <int>,
## # TotalP2011 <int>, TotalP2012 <int>, TotalP2013 <int>,
## # TotalP2014 <int>, TotalE2010 <dbl>, TotalE2011 <dbl>,
## # TotalE2012 <dbl>, TotalE2013 <dbl>, TotalE2014 <dbl>,
## # TotalPrice2010 <dbl>, TotalPrice2011 <dbl>, TotalPrice2012 <dbl>,
## # TotalPrice2013 <dbl>, TotalPrice2014 <dbl>, TotalC10.11 <dbl>,
## # TotalC11.12 <dbl>, TotalC12.13 <dbl>, TotalC13.14 <dbl>,
## # TotalP10.11 <dbl>, TotalP11.12 <dbl>, TotalP12.13 <dbl>,
## # TotalP13.14 <dbl>, TotalE10.11 <dbl>, TotalE11.12 <dbl>,
## # TotalE12.13 <dbl>, TotalE13.14 <dbl>, TotalPrice10.11 <dbl>,
## # TotalPrice11.12 <dbl>, TotalPrice12.13 <dbl>, TotalPrice13.14 <dbl>,
## # BiomassC2010 <int>, BiomassC2011 <int>, BiomassC2012 <int>,
## # BiomassC2013 <int>, BiomassC2014 <int>, CoalC2010 <int>,
## # CoalC2011 <int>, CoalC2012 <int>, CoalC2013 <int>, CoalC2014 <int>,
## # CoalP2010 <int>, CoalP2011 <int>, CoalP2012 <int>, CoalP2013 <int>,
## # CoalP2014 <int>, CoalE2010 <dbl>, CoalE2011 <dbl>, CoalE2012 <dbl>,
## # CoalE2013 <dbl>, CoalE2014 <dbl>, CoalPrice2010 <dbl>,
## # CoalPrice2011 <dbl>, CoalPrice2012 <dbl>, CoalPrice2013 <dbl>,
## # CoalPrice2014 <dbl>, ElecC2010 <int>, ElecC2011 <int>,
## # ElecC2012 <int>, ElecC2013 <int>, ElecC2014 <int>, ElecE2010 <dbl>,
## # ElecE2011 <dbl>, ElecE2012 <dbl>, ElecE2013 <dbl>, ElecE2014 <dbl>,
## # ElecPrice2010 <dbl>, ElecPrice2011 <dbl>, ElecPrice2012 <dbl>,
## # ElecPrice2013 <dbl>, ElecPrice2014 <dbl>, FossFuelC2010 <int>,
## # FossFuelC2011 <int>, FossFuelC2012 <int>, FossFuelC2013 <int>,
## # FossFuelC2014 <int>, GeoC2010 <int>, GeoC2011 <int>, GeoC2012 <int>,
## # GeoC2013 <int>, GeoC2014 <int>, GeoP2010 <int>, GeoP2011 <int>,
## # GeoP2012 <int>, GeoP2013 <int>, GeoP2014 <int>, HydroC2010 <int>,
## # HydroC2011 <int>, HydroC2012 <int>, HydroC2013 <int>,
## # HydroC2014 <int>, HydroP2010 <int>, HydroP2011 <int>,
## # HydroP2012 <int>, HydroP2013 <int>, HydroP2014 <int>,
## # NatGasC2010 <int>, ...
censusEditNL<-data.frame(State,
TotalC=c(TotalC2010,TotalC2011,TotalC2012,TotalC2013,TotalC2014),
TotalE=c(TotalE2010,TotalE2011,TotalE2012,TotalE2013,TotalE2014),
TotalP=c(TotalP2010,TotalP2011,TotalP2012,TotalP2013,TotalP2014),
TotalPrice=c(TotalPrice2010,TotalPrice2011,TotalPrice2012,TotalPrice2013,TotalPrice2014),
GDP=c(GDP2010,GDP2011,GDP2012,GDP2013,GDP2014),
FossFuelC=c(FossFuelC2010,FossFuelC2011,FossFuelC2012,FossFuelC2013,FossFuelC2014),
BiomassC=c(BiomassC2010,BiomassC2011,BiomassC2012,BiomassC2013,BiomassC2014),
CoalC=c(CoalC2010,CoalC2011,CoalC2012,CoalC2013,CoalC2014),
CoalE=c(CoalE2010,CoalE2011,CoalE2012,CoalE2013,CoalE2014),
CoalP=c(CoalP2010,CoalP2011,CoalP2012,CoalP2013,CoalP2014),
CoalPrice=c(CoalPrice2010,CoalPrice2011,CoalPrice2012,CoalPrice2013,CoalPrice2014),
ElecC=c(ElecC2010,ElecC2011,ElecC2012,ElecC2013,ElecC2014),
ElecE=c(ElecE2010,ElecE2011,ElecE2012,ElecE2013,ElecE2014),
ElecPrice=c(ElecPrice2010,ElecPrice2011,ElecPrice2012,ElecPrice2013,ElecPrice2014),
GeoC=c(GeoC2010,GeoC2011,GeoC2012,GeoC2013,GeoC2014),
GeoP=c(GeoP2010,GeoP2011,GeoP2012,GeoP2013,GeoP2014),
HydroC=c(HydroC2010,HydroC2011,HydroC2012,HydroC2013,HydroC2014),
HydroP=c(HydroP2010,HydroP2011,HydroP2012,HydroP2013,HydroP2014),
NatGasC=c(NatGasC2010,NatGasC2011,NatGasC2012,NatGasC2013,NatGasC2014),
NatGasE=c(NatGasE2010,NatGasE2011,NatGasE2012,NatGasE2013,NatGasE2014),
NatGasPrice=c(NatGasPrice2010,NatGasPrice2011,NatGasPrice2012,NatGasPrice2013,NatGasPrice2014),
LPGC=c(LPGC2010,LPGC2011,LPGC2012,LPGC2013,LPGC2014),
LPGE=c(LPGE2010,LPGE2011,LPGE2012,LPGE2013,LPGE2014),
LPGPrice=c(LPGPrice2010,LPGPrice2011,LPGPrice2012,LPGPrice2013,LPGPrice2014),
POPEST=c(POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014))
view(censusEditNL)
# write.csv(censusEditNL, file = "censusEditNL.csv")
censusEdit<-read.csv("censusEditNL.csv")
detach(census)
attach(censusEdit)
hist(GDP) #Data is heavily skewed, which seems to be the case for most of the data. Two more examples are provided (TotalC and TotaLE).
hist(TotalC)
hist(TotalE)
bootStrapCI1<-function(data, nsim){
n<-length(data)
bootCI<-c()
for(i in 1:nsim){
bootSamp<-sample(1:n, n, replace=TRUE)
thisXbar<-mean(data[bootSamp])
bootCI<-c(bootCI, thisXbar)
}
return(bootCI)
}
#Comparing original data CI with boostrapped data CI
t.test(GDP)
##
## One Sample t-test
##
## data: GDP
## t = 4.521, df = 259, p-value = 9.372e-06
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 347733.8 884403.9
## sample estimates:
## mean of x
## 616068.8
GDP_Boot<-bootStrapCI1(GDP, nsim=1000)
hist(GDP_Boot)
t.test(GDP_Boot)
##
## One Sample t-test
##
## data: GDP_Boot
## t = 143.63, df = 999, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 603728.1 620453.3
## sample estimates:
## mean of x
## 612090.7
t.test(TotalC)
##
## One Sample t-test
##
## data: TotalC
## t = 4.5429, df = 259, p-value = 8.513e-06
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 2110262 5339377
## sample estimates:
## mean of x
## 3724819
TotalC_Boot<-bootStrapCI1(TotalC, nsim=1000)
hist(TotalC_Boot)
t.test(TotalC_Boot)
##
## One Sample t-test
##
## data: TotalC_Boot
## t = 144.26, df = 999, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 3646820 3747401
## sample estimates:
## mean of x
## 3697111
t.test(TotalP)
##
## One Sample t-test
##
## data: TotalP
## t = 4.3397, df = 259, p-value = 2.047e-05
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 1632169 4343810
## sample estimates:
## mean of x
## 2987990
TotalP_Boot<-bootStrapCI1(TotalP, nsim=1000)
hist(TotalP_Boot)
t.test(TotalP_Boot)
##
## One Sample t-test
##
## data: TotalP_Boot
## t = 144.98, df = 999, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 2960496 3041739
## sample estimates:
## mean of x
## 3001117
t.test(TotalE)
##
## One Sample t-test
##
## data: TotalE
## t = 4.5355, df = 259, p-value = 8.794e-06
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 29311.94 74293.99
## sample estimates:
## mean of x
## 51802.97
TotalE_Boot<-bootStrapCI1(TotalE, nsim=1000)
hist(TotalE_Boot)
t.test(TotalE_Boot)
##
## One Sample t-test
##
## data: TotalE_Boot
## t = 146.42, df = 999, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 51240.69 52632.86
## sample estimates:
## mean of x
## 51936.77
Boot<-data.frame(GDP_Boot,TotalC_Boot,TotalP_Boot,TotalE_Boot)
# install.packages("splines")
library(splines)
set.seed(1)
# GDP as a response to TotalC
fit1=lm(GDP_Boot~poly(TotalC_Boot),data=Boot)
clims=range(TotalC_Boot)
c.grid=seq(from=clims[1],to=clims[2])
preds=predict(fit1,newdata =list(TotalC_Boot=c.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1))
fit=lm(GDP_Boot~bs(TotalC_Boot,knots=c(3100000,3700000,4200000)),data=Boot)
pred=predict(fit,newdata=list(TotalC_Boot=c.grid),se=T)
plot(TotalC_Boot,GDP_Boot,col="gray")
lines(c.grid,pred$fit,lwd=2)
lines(c.grid,pred$fit+2*pred$se,lty="dashed")
lines(c.grid,pred$fit-2*pred$se,lty="dashed")
#GDP as a response to TotalP
attr(bs(TotalP_Boot,df=6) ,"knots")
## 25% 50% 75%
## 2513142 2999437 3445557
fit2=lm(GDP_Boot~poly(TotalP_Boot),data=Boot)
plims=range(TotalP_Boot)
p.grid=seq(from=plims[1],to=plims[2])
preds=predict(fit2,newdata=list(TotalP_Boot=p.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1))
fit3=lm(GDP_Boot~bs(TotalP_Boot,knots=c(2500000, 300000, 3400000)),data=Boot)
pred=predict(fit3,newdata=list(TotalP_Boot=p.grid),se=T)
## Warning in predict.lm(fit3, newdata = list(TotalP_Boot = p.grid), se = T):
## prediction from a rank-deficient fit may be misleading
plot(TotalP_Boot,GDP_Boot,col="gray")
lines(p.grid,pred$fit,lwd=2)
lines(p.grid,pred$fit+2*pred$se,lty="dashed")
lines(p.grid,pred$fit-2*pred$se,lty="dashed")
# GDP as a response to TotalE
attr(bs(TotalE_Boot,df=6) ,"knots")
## 25% 50% 75%
## 43173.02 51546.21 59681.49
fit4=lm(GDP_Boot~poly(TotalE_Boot),data=Boot)
elims=range(TotalE_Boot)
e.grid=seq(from=elims[1],to=elims[2])
preds=predict(fit4,newdata=list(TotalE_Boot=e.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1))
fit5=lm(GDP_Boot~bs(TotalE_Boot,knots=c(44000, 51000, 59000)),data=Boot)
pred=predict(fit5,newdata=list(TotalE_Boot=e.grid),se=T)
plot(TotalE_Boot,GDP_Boot,col="gray")
lines(e.grid,pred$fit,lwd=2)
lines(e.grid,pred$fit+2*pred$se,lty="dashed")
lines(e.grid,pred$fit-2*pred$se,lty="dashed")
Coal
CoalC_Boot<-bootStrapCI1(CoalC, nsim=1000)
CoalP_Boot<-bootStrapCI1(CoalP, nsim=1000)
CoalE_Boot<-bootStrapCI1(CoalE, nsim=1000)
#CoalC
attr(bs(CoalC_Boot,df=6) ,"knots")
## 25% 50% 75%
## 602933.3 712181.3 830623.3
fit8=lm(GDP_Boot~poly(CoalC_Boot),data=Boot)
cclims=range(CoalC_Boot)
cc.grid=seq(from=cclims[1],to=cclims[2])
preds=predict(fit8,newdata=list(CoalC_Boot=cc.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1))
fit9=lm(GDP_Boot~bs(CoalC_Boot,knots=c(600000, 700000, 820000)),data=Boot)
pred=predict(fit9,newdata=list(CoalC_Boot=cc.grid),se=T)
plot(CoalC_Boot,GDP_Boot,col="gray")
lines(cc.grid,pred$fit,lwd=2)
lines(cc.grid,pred$fit+2*pred$se,lty="dashed")
lines(cc.grid,pred$fit-2*pred$se,lty="dashed")
#CoalP
attr(bs(CoalP_Boot,df=6) ,"knots")
## 25% 50% 75%
## 672540.0 800064.1 928645.3
fit10=lm(GDP_Boot~poly(CoalP_Boot),data=Boot)
cplims=range(CoalP_Boot)
cp.grid=seq(from=cplims[1],to=cplims[2])
preds=predict(fit10,newdata=list(CoalP_Boot=cp.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1))
fit11=lm(GDP_Boot~bs(CoalP_Boot,knots=c(670000, 790000, 930000)),data=Boot)
pred=predict(fit11,newdata=list(CoalP_Boot=cp.grid),se=T)
plot(CoalP_Boot,GDP_Boot,col="gray")
lines(cp.grid,pred$fit,lwd=2)
lines(cp.grid,pred$fit+2*pred$se,lty="dashed")
lines(cp.grid,pred$fit-2*pred$se,lty="dashed")
#CoalE
attr(bs(CoalE_Boot,df=6) ,"knots")
## 25% 50% 75%
## 1534.734 1771.294 2049.979
fit12=lm(GDP_Boot~poly(CoalE_Boot),data=Boot)
celims=range(CoalE_Boot)
ce.grid=seq(from=celims[1],to=celims[2])
preds=predict(fit12,newdata=list(CoalE_Boot=ce.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1))
fit13=lm(GDP_Boot~bs(CoalE_Boot,knots=c(1500, 1800, 2000)),data=Boot)
pred=predict(fit13,newdata=list(CoalE_Boot=ce.grid),se=T)
plot(CoalE_Boot,GDP_Boot,col="gray")
lines(ce.grid,pred$fit,lwd=2)
lines(ce.grid,pred$fit+2*pred$se,lty="dashed")
lines(ce.grid,pred$fit-2*pred$se,lty="dashed")
fossil fuels
#Fossil Fuel
FossFuelC_Boot<-bootStrapCI1(FossFuelC, nsim=1000)
Boot<-data.frame(GDP_Boot,TotalC_Boot,TotalP_Boot,TotalE_Boot,CoalC_Boot,CoalP_Boot,CoalE_Boot,FossFuelC_Boot) #Updating the Boot dataset to include the fossil fuel variable
attr(bs(FossFuelC_Boot,df=6) ,"knots")
## 25% 50% 75%
## 2566919 2985538 3398937
fit16=lm(GDP_Boot~poly(FossFuelC_Boot),data=Boot)
ffclims=range(FossFuelC_Boot)
ffc.grid=seq(from=ffclims[1],to=ffclims[2])
preds=predict(fit16,newdata=list(FossFuelC_Boot=ffc.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1))
fit17=lm(GDP_Boot~bs(FossFuelC_Boot,knots=c(2500000, 3000000, 3500000)),data=Boot)
pred=predict(fit17,newdata=list(FossFuelC_Boot=ffc.grid),se=T)
plot(FossFuelC_Boot,GDP_Boot,col="gray")
lines(ffc.grid,pred$fit,lwd=2)
lines(ffc.grid,pred$fit+2*pred$se,lty="dashed")
lines(ffc.grid,pred$fit-2*pred$se,lty="dashed")
electricity
ElecC_Boot<-bootStrapCI1(ElecC, nsim=1000)
ElecE_Boot<-bootStrapCI1(ElecE, nsim=1000)
#ElecC
attr(bs(ElecC_Boot,df=6) ,"knots")
## 25% 50% 75%
## 412357.9 484980.5 552505.8
fit18=lm(GDP_Boot~poly(ElecC_Boot),data=Boot)
eclims=range(ElecC_Boot)
ec.grid=seq(from=eclims[1],to=eclims[2])
preds=predict(fit18,newdata=list(ElecC_Boot=ec.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1))
fit19=lm(GDP_Boot~bs(ElecC_Boot,knots=c(420000, 490000, 580000 )),data=Boot)
pred=predict(fit19,newdata=list(ElecC_Boot=ec.grid),se=T)
plot(ElecC_Boot,GDP_Boot,col="gray")
lines(ec.grid,pred$fit,lwd=2)
lines(ec.grid,pred$fit+2*pred$se,lty="dashed")
lines(ec.grid,pred$fit-2*pred$se,lty="dashed")
#ElecE
attr(bs(ElecE_Boot,df=6) ,"knots")
## 25% 50% 75%
## 12254.01 14077.60 16201.15
fit20=lm(GDP_Boot~poly(ElecE_Boot),data=Boot)
eelims=range(ElecE_Boot)
ee.grid=seq(from=eelims[1],to=eelims[2])
preds=predict(fit20,newdata=list(ElecE_Boot=ee.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1))
fit21=lm(GDP_Boot~bs(ElecE_Boot,knots=c(12000, 14000, 16000 )),data=Boot)
pred=predict(fit21,newdata=list(ElecE_Boot=ee.grid),se=T)
plot(ElecE_Boot,GDP_Boot,col="gray")
lines(ee.grid,pred$fit,lwd=2)
lines(ee.grid,pred$fit+2*pred$se,lty="dashed")
lines(ee.grid,pred$fit-2*pred$se,lty="dashed")
hydraulic power
HydroC_Boot<-bootStrapCI1(HydroC, nsim=1000)
HydroP_Boot<-bootStrapCI1(HydroP, nsim=1000)
#Hydro C
attr(bs(HydroC_Boot,df=6) ,"knots")
## 25% 50% 75%
## 85507.97 99421.10 116514.12
fit22=lm(GDP_Boot~poly(HydroC_Boot),data=Boot)
hclims=range(HydroC_Boot)
hc.grid=seq(from=hclims[1],to=hclims[2])
preds=predict(fit22,newdata=list(HydroC_Boot=hc.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1))
fit23=lm(GDP_Boot~bs(HydroC_Boot,knots=c(86000, 101000, 117000)),data=Boot)
pred=predict(fit23,newdata=list(HydroC_Boot=hc.grid),se=T)
plot(HydroC_Boot,GDP_Boot,col="gray")
lines(hc.grid,pred$fit,lwd=2)
lines(hc.grid,pred$fit+2*pred$se,lty="dashed")
lines(hc.grid,pred$fit-2*pred$se,lty="dashed")
# HydroP
attr(bs(HydroP_Boot,df=6) ,"knots")
## 25% 50% 75%
## 8818.683 10405.898 12204.325
fit24=lm(GDP_Boot~poly(HydroP_Boot),data=Boot)
hplims=range(HydroP_Boot)
hp.grid=seq(from=hplims[1],to=hplims[2])
preds=predict(fit22,newdata=list(HydroP_Boot=hp.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1))
fit25=lm(GDP_Boot~bs(HydroP_Boot,knots=c(9000, 11000, 12000)),data=Boot)
pred=predict(fit25,newdata=list(HydroP_Boot=hp.grid),se=T)
plot(HydroP_Boot,GDP_Boot,col="gray")
lines(hp.grid,pred$fit,lwd=2)
lines(hp.grid,pred$fit+2*pred$se,lty="dashed")
lines(hp.grid,pred$fit-2*pred$se,lty="dashed")