library(tidyverse)
library(ggplot2)
library(plotly)
library(mosaic)
library(dplyr)
library(ggpubr)
library("rcompanion")
library(e1071)
library(rstatix)
AHSCdata <- read.csv("AHSC.csv")
#Cleaning data
AHSC <- AHSCdata %>%
rename(
Project_Area_Type = Project.Area.Type,
Total_MTCO2e = Total.MTCO2e..
) %>%
mutate(Transit_GHG_Emission_Reduction_Percentage = as.numeric(Transit.GHG.Emission.Reductions..MTCO2e.)/as.numeric(Total.GHG.Emission.Reductions..MTCO2e.)) %>%
mutate(Project_Area_Improvements = Total.GHG.Emission.Reductions..MTCO2e. - Transit.GHG.Emission.Reductions..MTCO2e.)
Null and Alternative hypothesis
H0: The mean outcome for the total Green House Gases Emission Reductions in metric tons of carbon dioxide per the amount of AHSC GGRP Funding Requested is the same across all Project Area Types.
HA: At least one mean is different
#Precentage of each Project Area Type
ProjectAreaTypeTable <- table(AHSC$Project_Area_Type)
prop.table(ProjectAreaTypeTable)*100
##
## ICP RIPA TOD
## 48.93617 21.27660 29.78723
favstats(AHSC$Total_MTCO2e)
## min Q1 median Q3 max mean sd n
## 0.000176 0.0006025 0.000946 0.001374 0.002738 0.00102166 0.0005970589 47
## missing
## 0
favstats(AHSC$Total_MTCO2e~AHSC$Project_Area_Type)
## AHSC$Project_Area_Type min Q1 median Q3 max
## 1 ICP 0.000229 0.0005830 0.000893 0.00127150 0.001982
## 2 RIPA 0.000176 0.0004815 0.000681 0.00122600 0.001399
## 3 TOD 0.000232 0.0007820 0.001028 0.00181475 0.002738
## mean sd n missing
## 1 0.0009717826 0.0005081106 23 0
## 2 0.0007910000 0.0004588660 10 0
## 3 0.0012683571 0.0007535482 14 0
Sum = groupwiseMean(Total_MTCO2e~Project_Area_Type, data = AHSC, conf = 0.95, digits=3)
Sum
## Project_Area_Type n Mean Conf.level Trad.lower Trad.upper
## 1 ICP 23 0.000972 0.95 0.000752 0.00119
## 2 RIPA 10 0.000791 0.95 0.000463 0.00112
## 3 TOD 14 0.001270 0.95 0.000833 0.00170
ggline(AHSC, x = "Project_Area_Type", y = "Total_MTCO2e",
add = c("mean_se", "jitter"),
order = c("ICP", "RIPA", "TOD"), xlab="Project Area Type", ylab="Total MTCO2e/$")
Trad.Upper and Trad.Lower tells you the confidence intervals
#All the histograms together
par(mfrow=c(1,3), pty="s")
#Histogram of ICP
hist(AHSC$Total_MTCO2e[AHSC$Project_Area_Type=="ICP"], main = "ICP", xlab="ICP" )
#Histogram of RIPA
hist(AHSC$Total_MTCO2e[AHSC$Project_Area_Type=="RIPA"], main = "RIPA", xlab="RIPA")
#Histogram of TOD
hist(AHSC$Total_MTCO2e[AHSC$Project_Area_Type=="TOD"], main = "TOD", xlab="TOD")
Histogram of ICP: The distribution is right screwed
Histogram of RIPA: The distribution is random. There is no clear process or pattern
Histogram of TOD: The distribution is random. There is no clear process or pattern
Sample Independence: Each sample is drawn independently of the other samples.
Variance Equality: The variance for all the Project Area Types vary. If you look back at the box plots for each project area, you can visually see the different variances. Numerically, the standard deviations of each project area are not close to each other.
Normality: The project area groups are not taken from a normally distributed population. If you look at the histograms, you see that only the ICP group shows some normality, however the other two, RIPA and TOD, are randomly distributed. Since each group have n less than 30, central tendency theorem cannot be implied.
kruskal.test(Total_MTCO2e~Project_Area_Type, data=AHSC)
##
## Kruskal-Wallis rank sum test
##
## data: Total_MTCO2e by Project_Area_Type
## Kruskal-Wallis chi-squared = 2.6611, df = 2, p-value = 0.2643
new <- AHSC%>%
dunn_test(Total_MTCO2e~Project_Area_Type, p.adjust.method="bonferroni")
new
## # A tibble: 3 x 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 Total_MTCO2e ICP RIPA 23 10 -0.784 0.433 1 ns
## 2 Total_MTCO2e ICP TOD 23 14 1.08 0.280 0.839 ns
## 3 Total_MTCO2e RIPA TOD 10 14 1.60 0.109 0.328 ns
new <- new %>%
add_xy_position(x="Project_Area_Type")
AHSC.kruskal <- kruskal_test(Total_MTCO2e~Project_Area_Type, data=AHSC)
ggboxplot(AHSC, x="Project_Area_Type", y="Total_MTCO2e")+
stat_pvalue_manual(new, hide.ns=TRUE)+
labs(subtitle = get_test_label(AHSC.kruskal, detailed=TRUE), caption = get_pwc_label(new))
Null and Alternative hypothesis
H0: There will be no significant prediction of total Green House Gases Emission Reductions in metric tons of carbon dioxide per the amount of AHSC GGRP Funding Requested by the Transit Green House Gases Emission Reduction Percentage.
HA: There is a relationship.
favstats(AHSC$Total_MTCO2e)
## min Q1 median Q3 max mean sd n
## 0.000176 0.0006025 0.000946 0.001374 0.002738 0.00102166 0.0005970589 47
## missing
## 0
favstats(AHSC$Transit_GHG_Emission_Reduction_Percentage)
## min Q1 median Q3 max mean sd
## -0.01427003 0.2307136 0.6376591 0.7882641 0.9225829 0.5282385 0.3286692
## n missing
## 47 0
favstats(AHSC$Total_MTCO2e)
## min Q1 median Q3 max mean sd n
## 0.000176 0.0006025 0.000946 0.001374 0.002738 0.00102166 0.0005970589 47
## missing
## 0
#Linear Regression between the two
LinearRegression <- lm(AHSC$Total_MTCO2e~AHSC$Transit_GHG_Emission_Reduction_Percentage)
summary(LinearRegression)
##
## Call:
## lm(formula = AHSC$Total_MTCO2e ~ AHSC$Transit_GHG_Emission_Reduction_Percentage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.387e-04 -2.786e-04 -3.583e-05 2.589e-04 1.228e-03
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 0.0003137 0.0001134
## AHSC$Transit_GHG_Emission_Reduction_Percentage 0.0013402 0.0001828
## t value Pr(>|t|)
## (Intercept) 2.767 0.00819 **
## AHSC$Transit_GHG_Emission_Reduction_Percentage 7.331 3.3e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0004075 on 45 degrees of freedom
## Multiple R-squared: 0.5443, Adjusted R-squared: 0.5342
## F-statistic: 53.75 on 1 and 45 DF, p-value: 3.298e-09
par(mfrow=c(1,2))
boxplot(AHSC$Total_MTCO2e, sub=paste("Outlier rows: ", boxplot.stats(AHSC$Total_MTCO2e)$out))
boxplot(AHSC$Transit_GHG_Emission_Reduction_Percentage, sub=paste("Outlier rows: ", boxplot.stats(AHSC$Transit_GHG_Emission_Reduction_Percentage)$out))
par(mfrow=c(2,2))
plot(LinearRegression)
Independence : Yes, the observations are independent of each other.
Homoscedasticity : In the residual vs fitter plot, you can see that the variance increases a bit at the end. However, for the most part, variance of residuals in the same for any value of X.
Linearity : The relationship between X and the mean of Y is linear, which can be seen in the Residual vs Fitted graph. The line is relatively flat.
Normality : You can see in the Normal QQ-Plot that for any fixed value of X, Y is normally distributed.
fit <- lm(Total_MTCO2e~Transit_GHG_Emission_Reduction_Percentage, data=AHSC)
plotly <- AHSC %>%
plot_ly(x=~Transit_GHG_Emission_Reduction_Percentage) %>%
add_markers(y= ~Total_MTCO2e, name="Project") %>%
add_lines(x=~Transit_GHG_Emission_Reduction_Percentage, y= fitted(fit), name="Linear Regression line")
plotly
#Estimating the correlation
cor(AHSC$Total_MTCO2e, AHSC$Transit_GHG_Emission_Reduction_Percentage)
## [1] 0.7377554
cor.test(AHSC$Total_MTCO2e, AHSC$Transit_GHG_Emission_Reduction_Percentage)
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 7.3311, df = 45, p-value = 3.298e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5717106 0.8457439
## sample estimates:
## cor
## 0.7377554
confint(LinearRegression)
## 2.5 % 97.5 %
## (Intercept) 8.532305e-05 0.0005421026
## AHSC$Transit_GHG_Emission_Reduction_Percentage 9.720051e-04 0.0017084008