##Midterm2
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readr)
# Load the CO2 emissions data set from the course GitHub repository.
# The URL is assembled with paste0() so that no line break ends up inside the
# string literal (a newline in the middle of the URL makes the download fail).
# stringsAsFactors = TRUE keeps the text columns (Make, Model, Vehicle.Class,
# Transmission, Fuel.Type) as factors on R >= 4.0 too; contrasts() further
# down only works on factors.
co2_url <- paste0(
  "https://raw.githubusercontent.com/kitadasmalley/",
  "MATH239/main/data/CO2_Emissions_Midterm.csv"
)
co2 <- read.csv(co2_url,
                sep = ",", quote = "\"",
                header = TRUE,
                stringsAsFactors = TRUE)
##Question1
# Print the structure of the data frame: dimensions, column names, column
# types and the first few values of each column.
str(co2)
## 'data.frame':    7385 obs. of  12 variables:
##  $ Make                            : Factor w/ 42 levels "ACURA","ALFA ROMEO",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Model                           : Factor w/ 2053 levels "124 Spider","124 SPIDER",..: 1058 1058 1059 1252 1524 1542 1801 1802 1802 1836 ...
##  $ Vehicle.Class                   : Factor w/ 16 levels "COMPACT","FULL-SIZE",..: 1 1 1 12 12 3 3 3 3 1 ...
##  $ Engine.Size.L.                  : num  2 2.4 1.5 3.5 3.5 3.5 3.5 3.7 3.7 2.4 ...
##  $ Cylinders                       : int  4 4 4 6 6 6 6 6 6 4 ...
##  $ Transmission                    : Factor w/ 27 levels "A10","A4","A5",..: 15 26 23 16 16 16 16 16 26 15 ...
##  $ Fuel.Type                       : Factor w/ 5 levels "D","E","N","X",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ Fuel.Consumption.City..L.100.km.: num  9.9 11.2 6 12.7 12.1 11.9 11.8 12.8 13.4 10.6 ...
##  $ Fuel.Consumption.Hwy..L.100.km. : num  6.7 7.7 5.8 9.1 8.7 7.7 8.1 9 9.5 7.5 ...
##  $ Fuel.Consumption.Comb..L.100.km.: num  8.5 9.6 5.9 11.1 10.6 10 10.1 11.1 11.6 9.2 ...
##  $ Fuel.Consumption.Comb..mpg.     : int  33 29 48 25 27 28 28 25 24 31 ...
##  $ CO2.Emissions.g.km.             : int  196 221 136 255 244 230 232 255 267 212 ...
# Show the dummy-variable coding R uses for Fuel.Type; the row that is all
# zeros is the reference level.
contrasts(co2$Fuel.Type)
##   E N X Z
## D 0 0 0 0
## E 1 0 0 0
## N 0 1 0 0
## X 0 0 1 0
## Z 0 0 0 1
 # 5 levels (D,E,N,X,Z) and R chose the reference level to be D according to the alphabetic order.
 # This does not make sense in the context of the data, the reference level should be X, the regular gasoline.
##Question2
 #A categorical variable with n categories is coded with n-1 dummy variables (Fuel.Type has n=5 levels, so 4 dummy variables). R uses the first level of the factor as the reference level, which is coded 0 on all n-1 dummy variables. Every other level is coded 1 on its own dummy variable and 0 on the rest.
##Question3
# Re-order the Fuel.Type levels so that X comes first and therefore becomes
# the reference level, then display the resulting dummy coding.
co2[["Fuel.Type"]] <- factor(
  co2[["Fuel.Type"]],
  levels = c("X", "Z", "D", "E", "N")
)
contrasts(co2[["Fuel.Type"]])
##   Z D E N
## X 0 0 0 0
## Z 1 0 0 0
## D 0 1 0 0
## E 0 0 1 0
## N 0 0 0 1
##Question4
# Side-by-side boxplots of CO2 emissions, one box (and fill colour) per
# fuel type.
ggplot(
  data = co2,
  mapping = aes(x = Fuel.Type, y = CO2.Emissions.g.km., fill = Fuel.Type)
) +
  geom_boxplot()

 #Regular gasoline(X), Premium gasoline(Z), and Ethanol(E) seems to have some outliers and there is only one observation for Natural gas.
 #On average, Ethanol(E) seems to have the highest co2 emission
##Question5
# Model 1: CO2 emissions explained by fuel type only (one-way ANOVA fitted
# as a linear model), followed by its coefficient summary.
mod1 <- lm(
  CO2.Emissions.g.km. ~ Fuel.Type,
  data = co2
)
summary(mod1)
## 
## Call:
## lm(formula = CO2.Emissions.g.km. ~ Fuel.Type, data = co2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -147.092  -42.119   -8.043   35.881  255.957 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 235.1193     0.9335 251.855   <2e-16 ***
## Fuel.TypeZ   30.9241     1.3643  22.666   <2e-16 ***
## Fuel.TypeD    2.4292     4.3571   0.558    0.577    
## Fuel.TypeE   39.9726     3.0722  13.011   <2e-16 ***
## Fuel.TypeN  -22.1193    56.3078  -0.393    0.694    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56.3 on 7380 degrees of freedom
## Multiple R-squared:  0.0747, Adjusted R-squared:  0.0742 
## F-statistic: 148.9 on 4 and 7380 DF,  p-value: < 2.2e-16
# ANOVA table for mod1: overall F-test for the Fuel.Type term.
anova(mod1)
## Analysis of Variance Table
## 
## Response: CO2.Emissions.g.km.
##             Df   Sum Sq Mean Sq F value    Pr(>F)    
## Fuel.Type    4  1888452  472113  148.95 < 2.2e-16 ***
## Residuals 7380 23392397    3170                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
 #H0: μ(X)=μ(Z)=μ(D)=μ(E)=μ(N) Ha: at least one μ is different
 #One-way ANOVA F-test
 #test statistic: F=148.95, reference distribution is the F-distribution
 #p-value < 2.2*10^(-16), which is below 0.05
 #We reject the null hypothesis at the 0.05 significance level (p-value < 2.2*10^(-16)).
 #There is convincing evidence that the mean CO2 emission differs for at least one Fuel.Type.
##Question6
 #y=B0+B1*X1+B2*X2+...+ε
##Question7
 #assumptions for fitting a linear model
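 #1.the mean of the random error is zero.
 #2.the response has a linear relationship with the explanatory variables (check with a scatterplot).
 #3.no multicollinearity: the explanatory variables are not strongly correlated with each other.
 #4.homoscedasticity: the variance of the random error is constant.
 #5.normality: the random error is normally distributed.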
##Question8
# Scatterplot matrix of the numeric columns, with CO2 emissions in the last
# row/column.
pairs(
  co2[, c(
    "Engine.Size.L.",
    "Cylinders",
    "Fuel.Consumption.City..L.100.km.",
    "Fuel.Consumption.Hwy..L.100.km.",
    "Fuel.Consumption.Comb..L.100.km.",
    "Fuel.Consumption.Comb..mpg.",
    "CO2.Emissions.g.km."
  )]
)

 #Every explanatory variable seems to have a positive relationship close to a linear relationship with the CO2 Emissions, but the Fuel.Consumption.Comb..mpg. has a negative relationship and has a curve. 
 #Also, because of categorical variables, the graph between explanatory variables like Fuel.Consumption.City..L.100.km.,Fuel.Consumption.Hwy..L.100.km.,Fuel.Consumption.Comb..L.100.km.,Fuel.Consumption.Comb..mpg. and the co2 emission has 2 lines or curves.
##Question9
# Scatterplot matrix restricted to the two combined fuel-consumption
# measures and CO2 emissions.
pairs(
  co2[, c(
    "Fuel.Consumption.Comb..L.100.km.",
    "Fuel.Consumption.Comb..mpg.",
    "CO2.Emissions.g.km."
  )]
)

 #I would include Fuel.Consumption.Comb..L.100.km. as an explanatory variable in my model because it seems to have a linear relationship with the response variable.
##Extra Credit
 #It is not appropriate to include Fuel Consumption City (L/100km), Fuel Consumption Hwy (L/100km), and Fuel Consumption Comb (L/100km) in the same model: the combined value is computed from the city and highway values, so the three variables are multicollinear, which inflates the VIF.
##Question10
# Scatterplot of CO2 emissions against combined fuel consumption (L/100 km).
ggplot(
  data = co2,
  mapping = aes(
    x = Fuel.Consumption.Comb..L.100.km.,
    y = CO2.Emissions.g.km.
  )
) +
  geom_point()

 #The direction is positive, it seems to have two to three lines and the strength of the lines seem moderately strong. There are some outliers between the lines and under the line.
##Question11
# Model 2: simple linear regression of CO2 emissions on combined fuel
# consumption, followed by its coefficient summary.
mod2 <- lm(
  CO2.Emissions.g.km. ~ Fuel.Consumption.Comb..L.100.km.,
  data = co2
)
summary(mod2)
## 
## Call:
## lm(formula = CO2.Emissions.g.km. ~ Fuel.Consumption.Comb..L.100.km., 
##     data = co2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -141.619   -6.048    1.952   11.667   62.954 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      46.76315    1.05937   44.14   <2e-16 ***
## Fuel.Consumption.Comb..L.100.km. 18.57132    0.09334  198.97   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.2 on 7383 degrees of freedom
## Multiple R-squared:  0.8428, Adjusted R-squared:  0.8428 
## F-statistic: 3.959e+04 on 1 and 7383 DF,  p-value: < 2.2e-16
 #Fitted CO2 Emissions (g/km).= 46.76315+18.57132*Fuel.Consumption.Comb..L.100.km.
##Question12
 #H0: B1=0     Ha: B1!=0
 #We reject the null hypothesis at the 0.05 significance level (p-value < 2*10^(-16)).
 #t-test(t distribution with degree of freedom=7383), test statistic is t=198.97
 #There is convincing evidence to suggest that there is a significant linear relationship between the Fuel.Consumption.Comb..L.100.km. and the CO2 Emissions. 
##Question13
# Draw the diagnostic plots for mod2 (used below to judge the residual,
# normality and leverage assumptions).
plot(mod2)

 #residual plot: the mean of the residuals is not exactly zero but is relatively close to zero. However, the residuals show a pattern (several roughly linear bands), so the variance is not homogeneous.
 #QQ plot: the points do not follow the normal line, especially below -1, so the normality assumption is not met.
 #leverage plot: there are no influential outliers; the Cook's distance contour does not appear in the graph, which means all cases are well inside it.
##Question14
# Same scatterplot as before, now with the points coloured by fuel type.
ggplot(
  data = co2,
  mapping = aes(
    x = Fuel.Consumption.Comb..L.100.km.,
    y = CO2.Emissions.g.km.,
    color = Fuel.Type
  )
) +
  geom_point()

 #X(Regular gasoline) and Z(Premium gasoline) seem to have a similar linear relationship between Fuel.Consumption.Comb..L.100.km. and CO2.Emissions.g.km. D(Diesel) seems to have a steeper slope and E(Ethanol) seems to have a smaller slope than the other fuel types.
##Question15
# Model 3: parallel-lines model - a common slope for combined fuel
# consumption plus a separate intercept shift for each fuel type.
mod3 <- lm(
  CO2.Emissions.g.km. ~ Fuel.Consumption.Comb..L.100.km. + Fuel.Type,
  data = co2
)
summary(mod3)
## 
## Call:
## lm(formula = CO2.Emissions.g.km. ~ Fuel.Consumption.Comb..L.100.km. + 
##     Fuel.Type, data = co2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -67.595  -2.760   0.045   2.234  44.852 
## 
## Coefficients:
##                                    Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                         5.34154    0.27768   19.236  < 2e-16 ***
## Fuel.Consumption.Comb..L.100.km.   22.78507    0.02601  875.998  < 2e-16 ***
## Fuel.TypeZ                          0.43328    0.13763    3.148  0.00165 ** 
## Fuel.TypeD                         30.89114    0.42649   72.432  < 2e-16 ***
## Fuel.TypeE                       -114.43678    0.34782 -329.016  < 2e-16 ***
## Fuel.TypeN                        -81.71198    5.49603  -14.867  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.495 on 7379 degrees of freedom
## Multiple R-squared:  0.9912, Adjusted R-squared:  0.9912 
## F-statistic: 1.66e+05 on 5 and 7379 DF,  p-value: < 2.2e-16
 #X: y=5.34154+22.78507*x1    y=co2 emission x1=fuel consumption comb L.100km
 #Z: y=5.77482+22.78507*x1
 #D: y=36.23268+22.78507*x1
 #E: y=-109.0952+22.78507*x1
 #N: y=-76.37044+22.78507*x1
##Question16
# Parallel-lines plot for mod3: the data coloured by fuel type, with one
# fitted line per fuel type sharing the common slope.
# The intercepts and slope are read from coef(mod3) instead of being typed in
# by hand, so the lines stay correct if the model is refitted, and the line
# colour is mapped to Fuel.Type so each line uses the same colour as its
# points in the legend.
mod3_coef <- coef(mod3)
# Levels exactly as used in the fit; the first one is the reference level,
# whose intercept is the model intercept with no shift added.
mod3_levels <- mod3$xlevels[["Fuel.Type"]]
mod3_shift <- c(0, unname(mod3_coef[paste0("Fuel.Type", mod3_levels[-1])]))
mod3_lines <- data.frame(
  Fuel.Type = factor(mod3_levels, levels = mod3_levels),
  intercept = mod3_coef[["(Intercept)"]] + mod3_shift,
  slope = mod3_coef[["Fuel.Consumption.Comb..L.100.km."]]
)
ggplot(
  co2,
  aes(
    x = Fuel.Consumption.Comb..L.100.km.,
    y = CO2.Emissions.g.km.,
    color = Fuel.Type
  )
) +
  geom_point() +
  geom_abline(
    data = mod3_lines,
    mapping = aes(intercept = intercept, slope = slope, color = Fuel.Type)
  )

# Sequential ANOVA table for mod3: tests the Fuel.Type shift after fuel
# consumption is already in the model.
anova(mod3)
## Analysis of Variance Table
## 
## Response: CO2.Emissions.g.km.
##                                    Df   Sum Sq  Mean Sq F value    Pr(>F)    
## Fuel.Consumption.Comb..L.100.km.    1 21307172 21307172  705688 < 2.2e-16 ***
## Fuel.Type                           4  3750880   937720   31057 < 2.2e-16 ***
## Residuals                        7379   222797       30                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
 #The shift for Fuel.Type is significant at the 0.05 significance level (and each Fuel.Type coefficient in summary(mod3) has a p-value below 0.05).
##Question17
# Model 4: interaction model - each fuel type gets its own intercept and its
# own slope for combined fuel consumption.
mod4 <- lm(
  CO2.Emissions.g.km. ~ Fuel.Consumption.Comb..L.100.km. * Fuel.Type,
  data = co2
)
summary(mod4)
## 
## Call:
## lm(formula = CO2.Emissions.g.km. ~ Fuel.Consumption.Comb..L.100.km. * 
##     Fuel.Type, data = co2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -69.127  -2.607   0.659   1.886  25.251 
## 
## Coefficients: (1 not defined because of singularities)
##                                              Estimate Std. Error  t value
## (Intercept)                                   0.42897    0.20641    2.078
## Fuel.Consumption.Comb..L.100.km.             23.27221    0.01988 1170.440
## Fuel.TypeZ                                    0.18413    0.32445    0.568
## Fuel.TypeD                                   -0.54618    1.30980   -0.417
## Fuel.TypeE                                    4.24637    0.92786    4.577
## Fuel.TypeN                                  -82.98605    2.95545  -28.079
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeZ  -0.03526    0.02923   -1.206
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeD   3.62697    0.14556   24.918
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeE  -7.23455    0.05649 -128.077
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeN        NA         NA       NA
##                                             Pr(>|t|)    
## (Intercept)                                   0.0377 *  
## Fuel.Consumption.Comb..L.100.km.             < 2e-16 ***
## Fuel.TypeZ                                    0.5704    
## Fuel.TypeD                                    0.6767    
## Fuel.TypeE                                   4.8e-06 ***
## Fuel.TypeN                                   < 2e-16 ***
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeZ   0.2279    
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeD  < 2e-16 ***
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeE  < 2e-16 ***
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeN       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.955 on 7376 degrees of freedom
## Multiple R-squared:  0.9975, Adjusted R-squared:  0.9975 
## F-statistic: 3.611e+05 on 8 and 7376 DF,  p-value: < 2.2e-16
 #X: CO2.Emissions.g.km.=0.42897+23.27221*(Fuel.Consumption.Comb..L.100.km)
 #Z: CO2.Emissions.g.km.=0.6131+23.23695*(Fuel.Consumption.Comb..L.100.km)
 #D: CO2.Emissions.g.km.=-0.11721+26.89918*(Fuel.Consumption.Comb..L.100.km)
 #E: CO2.Emissions.g.km.=4.67534+16.03766*(Fuel.Consumption.Comb..L.100.km)
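 #N: slope not estimable (NA) - there is only one natural gas observation, so no separate line can be fitted.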
##Question18
# Separate-lines plot for the interaction model: points coloured by fuel
# type with an independent least-squares line per fuel type.
# The data frame co2 is plotted directly rather than the lm object mod4:
# handing a model to ggplot() depends on fortify(), which ggplot2 advises
# against, and mod4 was fitted on these same columns of co2.
ggplot(
  co2,
  aes(
    x = Fuel.Consumption.Comb..L.100.km.,
    y = CO2.Emissions.g.km.,
    color = Fuel.Type
  )
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

# Sequential ANOVA table for mod4: the last row tests the interaction
# (different slopes by fuel type).
anova(mod4)
## Analysis of Variance Table
## 
## Response: CO2.Emissions.g.km.
##                                              Df   Sum Sq  Mean Sq   F value
## Fuel.Consumption.Comb..L.100.km.              1 21307172 21307172 2440799.3
## Fuel.Type                                     4  3750880   937720  107418.6
## Fuel.Consumption.Comb..L.100.km.:Fuel.Type    3   158408    52803    6048.7
## Residuals                                  7376    64389        9          
##                                               Pr(>F)    
## Fuel.Consumption.Comb..L.100.km.           < 2.2e-16 ***
## Fuel.Type                                  < 2.2e-16 ***
## Fuel.Consumption.Comb..L.100.km.:Fuel.Type < 2.2e-16 ***
## Residuals                                               
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
 #Fuel.Consumption.Comb..L.100.km., Fuel.Type, and their interaction (Fuel.Consumption.Comb..L.100.km.:Fuel.Type) are all significant at the 0.05 significance level.
##Question19
# Re-print the mod2 summary so its adjusted R-squared can be compared with
# the other models.
summary(mod2)
## 
## Call:
## lm(formula = CO2.Emissions.g.km. ~ Fuel.Consumption.Comb..L.100.km., 
##     data = co2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -141.619   -6.048    1.952   11.667   62.954 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      46.76315    1.05937   44.14   <2e-16 ***
## Fuel.Consumption.Comb..L.100.km. 18.57132    0.09334  198.97   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.2 on 7383 degrees of freedom
## Multiple R-squared:  0.8428, Adjusted R-squared:  0.8428 
## F-statistic: 3.959e+04 on 1 and 7383 DF,  p-value: < 2.2e-16
# Re-print the mod3 summary for the adjusted R-squared comparison.
summary(mod3)
## 
## Call:
## lm(formula = CO2.Emissions.g.km. ~ Fuel.Consumption.Comb..L.100.km. + 
##     Fuel.Type, data = co2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -67.595  -2.760   0.045   2.234  44.852 
## 
## Coefficients:
##                                    Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                         5.34154    0.27768   19.236  < 2e-16 ***
## Fuel.Consumption.Comb..L.100.km.   22.78507    0.02601  875.998  < 2e-16 ***
## Fuel.TypeZ                          0.43328    0.13763    3.148  0.00165 ** 
## Fuel.TypeD                         30.89114    0.42649   72.432  < 2e-16 ***
## Fuel.TypeE                       -114.43678    0.34782 -329.016  < 2e-16 ***
## Fuel.TypeN                        -81.71198    5.49603  -14.867  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.495 on 7379 degrees of freedom
## Multiple R-squared:  0.9912, Adjusted R-squared:  0.9912 
## F-statistic: 1.66e+05 on 5 and 7379 DF,  p-value: < 2.2e-16
# Re-print the mod4 summary for the adjusted R-squared comparison.
summary(mod4)
## 
## Call:
## lm(formula = CO2.Emissions.g.km. ~ Fuel.Consumption.Comb..L.100.km. * 
##     Fuel.Type, data = co2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -69.127  -2.607   0.659   1.886  25.251 
## 
## Coefficients: (1 not defined because of singularities)
##                                              Estimate Std. Error  t value
## (Intercept)                                   0.42897    0.20641    2.078
## Fuel.Consumption.Comb..L.100.km.             23.27221    0.01988 1170.440
## Fuel.TypeZ                                    0.18413    0.32445    0.568
## Fuel.TypeD                                   -0.54618    1.30980   -0.417
## Fuel.TypeE                                    4.24637    0.92786    4.577
## Fuel.TypeN                                  -82.98605    2.95545  -28.079
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeZ  -0.03526    0.02923   -1.206
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeD   3.62697    0.14556   24.918
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeE  -7.23455    0.05649 -128.077
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeN        NA         NA       NA
##                                             Pr(>|t|)    
## (Intercept)                                   0.0377 *  
## Fuel.Consumption.Comb..L.100.km.             < 2e-16 ***
## Fuel.TypeZ                                    0.5704    
## Fuel.TypeD                                    0.6767    
## Fuel.TypeE                                   4.8e-06 ***
## Fuel.TypeN                                   < 2e-16 ***
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeZ   0.2279    
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeD  < 2e-16 ***
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeE  < 2e-16 ***
## Fuel.Consumption.Comb..L.100.km.:Fuel.TypeN       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.955 on 7376 degrees of freedom
## Multiple R-squared:  0.9975, Adjusted R-squared:  0.9975 
## F-statistic: 3.611e+05 on 8 and 7376 DF,  p-value: < 2.2e-16
 #adjusted R square of mod2 is 0.8428, adjusted R square of mod3 is 0.9912, adjusted R square of mod4 is 0.9975.
 #mod4(the last model) has the highest adjusted R square, so I would pick the last model.
 #This means the co2 emission has a relationship with the Fuel.Consumption.Comb..L.100.km. and the Fuel.Type. Also there is an interaction between these two explanatory variables. 
##Extra Credit Opportunities
# Point prediction from mod4 for a regular-gasoline (X) vehicle.
# mod4 takes fuel consumption in L/100 km. Converting from miles per gallon
# means dividing the constant by the mpg figure (L/100 km = 235.21 / mpg),
# not multiplying: 25 * 235.21 = 5880.25 L/100 km lies far outside the data
# and produced a meaningless fit of about 136847 g/km.
# NOTE(review): assumes 25 is an mpg rating and 235.21 the mpg-to-L/100 km
# constant intended by the assignment - confirm against the question text.
# The output recorded below this call came from the old value and has to be
# re-rendered.
newdata <- data.frame(
  Fuel.Consumption.Comb..L.100.km. = 235.21 / 25,
  Fuel.Type = "X"
)
predict(mod4, newdata)
## Warning in predict.lm(mod4, newdata): prediction from a rank-deficient fit may
## be misleading
##        1 
## 136846.8
# Prediction interval for the same new observation. The interval type is
# written out in full ("prediction") instead of relying on partial matching
# of the abbreviated value "predict", and newdata is passed by name.
predBand <- predict(mod4, newdata = newdata, interval = "prediction")
## Warning in predict.lm(mod4, newdata, interval = "predict"): prediction from a
## rank-deficient fit may be misleading
# Display the fitted value with the lower and upper interval bounds.
predBand
##        fit    lwr      upr
## 1 136846.8 136618 137075.7