Ling Hou (s3637388), Scott Keene (s3686673), Kamalpreet Khangura (s3688108)
Last updated: 21 October, 2017
Dataset: https://www.kaggle.com/unsdsn/world-happiness
# Read the data file from the project folder.
Dataset_for_Assignment_4 <- read_csv("Dataset for Assignment 4.csv")

# Descriptive statistics for Happiness.Score: five-number summary, mean,
# standard deviation and row count. Missing values are ignored in every
# statistic except n, which counts all rows.
Happiness_Score_Summary <- Dataset_for_Assignment_4 %>%
  summarise(
    Min = min(Happiness.Score, na.rm = TRUE),
    Q1 = quantile(Happiness.Score, probs = .25, na.rm = TRUE),
    Median = median(Happiness.Score, na.rm = TRUE),
    Q3 = quantile(Happiness.Score, probs = .75, na.rm = TRUE),
    Max = max(Happiness.Score, na.rm = TRUE),
    Mean = mean(Happiness.Score, na.rm = TRUE),
    SD = sd(Happiness.Score, na.rm = TRUE),
    n = n()
  )
Happiness_Score_Summary

# Outlier screen for Happiness.Score.
# - LowerFence / UpperFence: Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, rounded to 1 dp.
# - *OutlierCount: number of scores beyond the (already rounded) fences;
#   summarise() lets later columns refer to the fences created just above.
# - SD_MinToMean / SD_MeanToMax: distance of the minimum / maximum from the
#   mean, expressed in standard deviations.
# na.rm = TRUE is passed throughout so the result agrees with the summary
# table above and does not fail if a score is missing.
Happiness_Score_Outliers <- Dataset_for_Assignment_4 %>%
  summarise(
    LowerFence = (
      quantile(Happiness.Score, probs = .25, na.rm = TRUE) -
        1.5 * IQR(Happiness.Score, na.rm = TRUE)
    ) %>% round(1),
    UpperFence = (
      quantile(Happiness.Score, probs = .75, na.rm = TRUE) +
        1.5 * IQR(Happiness.Score, na.rm = TRUE)
    ) %>% round(1),
    LowerOutlierCount = sum(Happiness.Score < LowerFence, na.rm = TRUE) %>%
      round(0),
    UpperOutlierCount = sum(Happiness.Score > UpperFence, na.rm = TRUE) %>%
      round(0),
    SD_MinToMean = (
      (mean(Happiness.Score, na.rm = TRUE) - min(Happiness.Score, na.rm = TRUE)) /
        sd(Happiness.Score, na.rm = TRUE)
    ) %>% round(1),
    SD_MeanToMax = (
      (max(Happiness.Score, na.rm = TRUE) - mean(Happiness.Score, na.rm = TRUE)) /
        sd(Happiness.Score, na.rm = TRUE)
    ) %>% round(1)
  )
Happiness_Score_Outliers
# Normal Q-Q plot of the happiness score.
Dataset_for_Assignment_4$Happiness.Score %>% qqPlot(dist = "norm")

# Descriptive statistics for Economy..GDP.per.Capita. (the "wealth" score):
# five-number summary, mean, standard deviation and row count. Missing values
# are ignored in every statistic except n, which counts all rows.
Wealth_Score_Summary <- Dataset_for_Assignment_4 %>%
  summarise(
    Min = min(Economy..GDP.per.Capita., na.rm = TRUE),
    Q1 = quantile(Economy..GDP.per.Capita., probs = .25, na.rm = TRUE),
    Median = median(Economy..GDP.per.Capita., na.rm = TRUE),
    Q3 = quantile(Economy..GDP.per.Capita., probs = .75, na.rm = TRUE),
    Max = max(Economy..GDP.per.Capita., na.rm = TRUE),
    Mean = mean(Economy..GDP.per.Capita., na.rm = TRUE),
    SD = sd(Economy..GDP.per.Capita., na.rm = TRUE),
    n = n()
  )
Wealth_Score_Summary

# Outlier screen for the wealth score: 1.5 * IQR fences (rounded to 1 dp),
# counts of scores beyond each (already rounded) fence, and the distance of
# the minimum / maximum from the mean in standard deviations.
# na.rm = TRUE is passed throughout so the result agrees with the summary
# table above and does not fail if a score is missing.
Wealth_Score_Outliers <- Dataset_for_Assignment_4 %>%
  summarise(
    LowerFence = (
      quantile(Economy..GDP.per.Capita., probs = .25, na.rm = TRUE) -
        1.5 * IQR(Economy..GDP.per.Capita., na.rm = TRUE)
    ) %>% round(1),
    LowerOutlierCount = sum(
      Economy..GDP.per.Capita. < LowerFence,
      na.rm = TRUE
    ) %>% round(0),
    NumSD_MinToMean = (
      (mean(Economy..GDP.per.Capita., na.rm = TRUE) -
        min(Economy..GDP.per.Capita., na.rm = TRUE)) /
        sd(Economy..GDP.per.Capita., na.rm = TRUE)
    ) %>% round(1),
    UpperFence = (
      quantile(Economy..GDP.per.Capita., probs = .75, na.rm = TRUE) +
        1.5 * IQR(Economy..GDP.per.Capita., na.rm = TRUE)
    ) %>% round(1),
    UpperOutlierCount = sum(
      Economy..GDP.per.Capita. > UpperFence,
      na.rm = TRUE
    ) %>% round(0),
    NumSD_MeanToMax = (
      (max(Economy..GDP.per.Capita., na.rm = TRUE) -
        mean(Economy..GDP.per.Capita., na.rm = TRUE)) /
        sd(Economy..GDP.per.Capita., na.rm = TRUE)
    ) %>% round(1)
  )
Wealth_Score_Outliers

# Two-sided paired t-test of the per-country difference
# (wealth score - happiness score).
t.test(
  Dataset_for_Assignment_4$Economy..GDP.per.Capita.,
  Dataset_for_Assignment_4$Happiness.Score,
  paired = TRUE,
  alternative = "two.sided"
)
##
## Paired t-test
##
## data: Dataset_for_Assignment_4$Economy..GDP.per.Capita. and Dataset_for_Assignment_4$Happiness.Score
## t = -65.809, df = 154, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.500461 -4.238141
## sample estimates:
## mean of the differences
## -4.369301
# Normal Q-Q plot of the wealth (GDP per capita) score.
Dataset_for_Assignment_4$Economy..GDP.per.Capita. %>% qqPlot(dist = "norm")

# Side-by-side box plots of the two scores; the x axis positions 1 and 2 are
# labelled afterwards with axis().
boxplot(
  Dataset_for_Assignment_4$Economy..GDP.per.Capita.,
  Dataset_for_Assignment_4$Happiness.Score,
  ylab = "Scores Ratings",
  xlab = "Wealth and Happiness"
)
axis(1, at = 1:2, labels = c("Wealth", "Happiness"))

# Paired-profile plot: the two columns are transposed so that every country
# becomes one line joining its wealth score (x = 1) to its happiness score
# (x = 2). The default x axis is suppressed (xaxt = "n") and redrawn with
# descriptive labels.
matplot(
  t(data.frame(
    Dataset_for_Assignment_4$Economy..GDP.per.Capita.,
    Dataset_for_Assignment_4$Happiness.Score
  )),
  type = "b",
  pch = 19,
  col = 1,
  lty = 1,
  xlab = "Comparison",
  ylab = "Score rating",
  xaxt = "n"
)
axis(1, at = 1:2, labels = c("Wealth Score", "Happiness Score"))
# From the matplot we can clearly see a positive relationship between Economy
# and Happiness: if either the wealth score or the happiness score increases,
# the other one increases as well.
# Run granova.ds() on the paired wealth / happiness columns; the summary
# statistics it prints follow below.
granova.ds(
  data = data.frame(
    Dataset_for_Assignment_4$Economy..GDP.per.Capita.,
    Dataset_for_Assignment_4$Happiness.Score
  ),
  ylab = "Happiness Score",
  xlab = "Wealth Score"
)
## Summary Stats
## n 155.000
## mean(x) 0.985
## mean(y) 5.354
## mean(D=x-y) -4.369
## SD(D) 0.827
## ES(D) -5.286
## r(x,y) 0.812
## r(x+y,d) -0.893
## LL 95%CI -4.500
## UL 95%CI -4.238
## t(D-bar) -65.809
## df.t 154.000
## pval.t 0.000
A paired-samples t-test was used to test for a significant mean difference between the economy (wealth) and happiness scores. The mean difference (wealth − happiness) was found to be -4.37 (SD = 0.827). Visual inspection of the Q-Q plots suggested that Happiness.Score was approximately normally distributed, but Economy..GDP.per.Capita. was not so clear. The paired-samples t-test found a statistically significant mean difference between wealth scores and happiness scores, t(df = 154) = -65.8, p < 0.05, 95% CI [-4.500461, -4.238141]. Happiness scores were found to be significantly higher than wealth scores.
# knitr:kable function to print nice HTML tables. Here is an example R code:

# Manual least-squares fit of Economy..GDP.per.Capita. (response, y) on
# Happiness.Score (predictor, x) -- the same orientation as the plot and the
# lm() formula below. The slope must divide by the predictor's sum of squares;
# dividing by the response's sum of squares instead gives the slope of the
# reverse regression and a red line that does not match lm().
x2 <- Dataset_for_Assignment_4$Happiness.Score^2
y2 <- Dataset_for_Assignment_4$Economy..GDP.per.Capita.^2
xy <- Dataset_for_Assignment_4$Happiness.Score *
  Dataset_for_Assignment_4$Economy..GDP.per.Capita.
sum_x <- sum(Dataset_for_Assignment_4$Happiness.Score)
sum_y <- sum(Dataset_for_Assignment_4$Economy..GDP.per.Capita.)
sum_x_sq <- sum(x2)
sum_y_sq <- sum(y2)
sum_xy <- sum(xy)
n <- length(Dataset_for_Assignment_4$Happiness.Score) # Sample size

# Corrected sums of squares and cross-products.
Lxx <- sum_x_sq - ((sum_x^2) / n)
Lyy <- sum_y_sq - ((sum_y^2) / n)
Lxy <- sum_xy - ((sum_x * sum_y) / n)

# Slope and intercept of y on x: b = Lxy / Lxx, a = mean(y) - b * mean(x).
# Note that b * Lxy / Lyy (= Lxy^2 / (Lxx * Lyy)) is symmetric in x and y, so
# an R-squared computed from b, Lxy and Lyy does not depend on this labelling.
b <- Lxy / Lxx
a <- mean(Dataset_for_Assignment_4$Economy..GDP.per.Capita.) -
  b * mean(Dataset_for_Assignment_4$Happiness.Score)

# Reference fit with lm(); its coefficients should agree with a and b above.
HapEconmodel <- lm(
  Economy..GDP.per.Capita. ~ Happiness.Score,
  data = Dataset_for_Assignment_4
)

# Scatter plot with the hand-computed line (red) and the lm() line (black).
plot(
  Economy..GDP.per.Capita. ~ Happiness.Score,
  data = Dataset_for_Assignment_4,
  xlab = "Happiness Score",
  ylab = "Economy Score"
)
abline(a = a, b = b, col = "red")
abline(HapEconmodel)

HapEconmodel %>% summary()
##
## Call:
## lm(formula = Economy..GDP.per.Capita. ~ Happiness.Score, data = Dataset_for_Assignment_4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.90072 -0.16663 0.00354 0.16685 0.61731
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.63338 0.09593 -6.603 6.27e-10 ***
## Happiness.Score 0.30222 0.01753 17.238 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2461 on 153 degrees of freedom
## Multiple R-squared: 0.6601, Adjusted R-squared: 0.6579
## F-statistic: 297.1 on 1 and 153 DF, p-value: < 2.2e-16
# Coefficient of determination from the hand-computed slope and the
# corrected sums of squares/cross-products.
R2 <- b * Lxy / Lyy
R2
## [1] 0.6601055
H0: The data do not fit the linear regression model
# Upper-tail p-value of the model F statistic. 297.1 on 1 and 153 degrees of
# freedom is the (rounded) value printed by summary(HapEconmodel).
pf(q = 297.1, df1 = 1, df2 = 153, lower.tail = FALSE)
## [1] 1.117922e-37

# The same F statistic recomputed from R-squared: R2 / (1 - R2) * (df2 / df1).
(R2 / (1 - R2) * (153 / 1))
## [1] 297.1396

# ANOVA table for the fitted model.
HapEconmodel %>% anova()
# We confirm the p-value reported in the summary to be p<.001. As the p-value
# is less than the 0.05 level of significance, we reject H0. There was
# statistically significant evidence that the data fit a linear regression
# model.

# Coefficient table (estimate, standard error, t value, p-value).
HapEconmodel %>% summary() %>% coef()
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.6333762 0.09592844 -6.602591 6.271247e-10
## Happiness.Score 0.3022205 0.01753249 17.237739 1.110391e-37
The intercept/constant is reported as a = -0.633. This value represents the average Economy..GDP.per.Capita. score when the happiness score is equal to 0. To test the statistical significance of the constant, we set the following statistical hypotheses: $H_0: \alpha = 0$, $H_A: \alpha \neq 0$. This hypothesis is tested using a t statistic, reported as t = -6.6026, p < .001. The constant is statistically significant at the 0.05 level. This means that there is statistically significant evidence that the constant is not 0.
# Confidence intervals (2.5 % and 97.5 % bounds) for the intercept and slope.
confint(HapEconmodel)
## 2.5 % 97.5 %
## (Intercept) -0.8228915 -0.4438609
## Happiness.Score 0.2675835 0.3368575
R reports the 95% CI for a to be [-0.8228915, -0.4438609]. $H_0: \alpha = 0$ is clearly not captured by this interval, so it was rejected.
$H_0: \beta = 0$, $H_A: \beta \neq 0$. The slope of the regression line was reported as b = 0.302. A one-unit increase in Happiness Score was related to an average increase in Economy..GDP.per.Capita. of 0.302 units. This is a positive change. We confirm that p < .001. As p < .05, we reject H0. There was statistically significant evidence that Happiness Score was positively related to Economy..GDP.per.Capita.
# Draw the plots produced by plot() for the fitted lm object.
HapEconmodel %>% plot()