# Q2.1 Scatterplot of the dataset
# Read the data file; header = TRUE takes the column names from its first row.
HCI <- read.table(file = "HCI.txt", header = TRUE)
summary(object = HCI)
## Counrty HumanCapitalIndex PercentageofImprovedSanitationCoverage
## Length:155 Min. :0.2900 Min. : 7.079
## Class :character 1st Qu.:0.4250 1st Qu.: 44.386
## Mode :character Median :0.5600 Median : 88.290
## Mean :0.5654 Mean : 72.854
## 3rd Qu.:0.6900 3rd Qu.: 98.024
## Max. :0.8800 Max. :100.000
# Response (Human Capital Index) on the vertical axis against the predictor
# (sanitation coverage) on the horizontal axis.
with(
  HCI,
  plot(
    x = PercentageofImprovedSanitationCoverage,
    y = HumanCapitalIndex,
    xlab = "Percentage of Improved Sanitation Coverage",
    ylab = "Human Capital Index"
  )
)

#Q2.2 Finding sample size n
# Each row of HCI is one observation, so n is the first dimension of the
# data frame.
n <- dim(HCI)[1]
cat("The sample size n is: ", n, "\n")
## The sample size n is: 155
#Q2.3 State the least-square criterion based on this real data set
# Least-squares criterion: choose (beta0, beta1) to minimise
#   S(beta0, beta1) = sum_i (y_i - beta0 - beta1 * x_i)^2,
# where x is the sanitation coverage and y is the Human Capital Index.
# The minimiser is slope = Sxy / Sxx and intercept = ybar - slope * xbar.
x <- HCI$PercentageofImprovedSanitationCoverage  # predictor
y <- HCI$HumanCapitalIndex                       # response
xbar <- mean(x)
ybar <- mean(y)
Sxx <- sum((x - xbar)^2)
# sum(y * (x - xbar)) equals sum((y - ybar) * (x - xbar)) because the
# deviations (x - xbar) sum to zero.
Sxy <- sum(y * (x - xbar))
betahat <- Sxy / Sxx               # slope estimate
betahat2 <- ybar - betahat * xbar  # intercept estimate
# Take the sign from the slope itself rather than hard-coding "-": the
# estimated slope here is positive, so the fitted line is intercept + slope * x.
slope_sign <- if (betahat < 0) "-" else "+"
cat("The least-square fit is: yˆ=", betahat2, slope_sign, abs(betahat), "x\n")
## The least-square fit is: yˆ= 0.26036 + 0.00418641 x
#Q2.4 Fit a SLR model with intercept based on least-squares estimators
# `graph` holds the fitted model object (Human Capital Index regressed on
# sanitation coverage); later sections reuse it.
graph <- lm(HumanCapitalIndex ~ PercentageofImprovedSanitationCoverage, data = HCI)
summary(graph)
##
## Call:
## lm(formula = HumanCapitalIndex ~ PercentageofImprovedSanitationCoverage,
## data = HCI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.218965 -0.060953 -0.005689 0.073137 0.200999
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.260360 0.017884 14.56 <2e-16 ***
## PercentageofImprovedSanitationCoverage 0.004186 0.000227 18.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08481 on 153 degrees of freedom
## Multiple R-squared: 0.6898, Adjusted R-squared: 0.6878
## F-statistic: 340.2 on 1 and 153 DF, p-value: < 2.2e-16
# Draw the scatterplot first, then overlay the fitted line. abline() is a
# separate call: passed positionally inside plot() it would be bound to the
# `type` argument and only drawn as a side effect of lazy evaluation.
plot(
  HCI$PercentageofImprovedSanitationCoverage,
  HCI$HumanCapitalIndex,
  xlab = "Percentage of Improved Sanitation Coverage",
  ylab = "Human Capital Index"
)
abline(graph)

#Q2.5 Coefficient of determination
# R^2 is read from the `r.squared` component of the model summary.
cat(
  "The coefficient of determination is ",
  summary(graph)[["r.squared"]]
)
## The coefficient of determination is 0.6897889
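#The coefficient of determination measures the proportion of the total variability in the response that is explained by the regression model. In this context, R^2 = 0.6898 means that about 69% of the variation in the Human Capital Index is explained by its linear relationship with the percentage of improved sanitation coverage.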
#Q2.6
#The least-squares estimate of the slope in the SLR model is 0.0042: for each one-percentage-point increase in improved sanitation coverage, the mean Human Capital Index is estimated to increase by about 0.0042.
#Q2.7 The anova table is given by
# Splits the total variation of the response into the part explained by the
# regression and the residual part, with the F statistic for the regression.
anova(object = graph)
## Analysis of Variance Table
##
## Response: HumanCapitalIndex
## Df Sum Sq Mean Sq F value Pr(>F)
## PercentageofImprovedSanitationCoverage 1 2.4473 2.44727 340.21 < 2.2e-16 ***
## Residuals 153 1.1006 0.00719
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Q2.8 Unbiased estimator for σ^2 -> residual mean square MSE = SSE/(n-2)
# (This is the "Mean Sq" of the Residuals row below, not the sample variance
# of the Human Capital Index.)
anova(graph)
## Analysis of Variance Table
##
## Response: HumanCapitalIndex
## Df Sum Sq Mean Sq F value Pr(>F)
## PercentageofImprovedSanitationCoverage 1 2.4473 2.44727 340.21 < 2.2e-16 ***
## Residuals 153 1.1006 0.00719
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Compute SSE / (n - 2) from the fitted model instead of retyping the rounded
# table entries (1.1006 / 153), so no rounding error is carried forward.
# The name `var` is kept because later code in this script reads it.
var <- sum(residuals(graph)^2) / df.residual(graph)
cat("Unbiased estimator for σ^2 is ", var)
# The printed value is approximately 0.00719, i.e. the Residuals Mean Sq in
# the table above.
#Q2.9 test significance of SLR model based on sample data by critical region method
#To test the significance of regression, F-test will be used
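#H0: β1 = 0 versus H1: β1 ≠ 0, at significance level alpha = 0.01
#From the ANOVA table, F0 = 340.21. The critical region is F0 > F(0.01,1,153), which is approximately 6.80 (qf(0.99, 1, 153)); the p-value for this test is less than 2.2*10^(-16).
#Since F0 = 340.21 falls in the critical region, H0 is rejected, and we conclude that there is a significant linear relationship between the Human Capital Index and the percentage of improved sanitation coverage.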
#Q2.10 test SLR model without intercept by t-test
# Regression through the origin: the `+ 0` term in the formula removes the
# intercept, so only the slope is estimated.
modelwithoutintercept <- lm(
  formula = HumanCapitalIndex ~ PercentageofImprovedSanitationCoverage + 0,
  data = HCI
)
summary(object = modelwithoutintercept)
##
## Call:
## lm(formula = HumanCapitalIndex ~ PercentageofImprovedSanitationCoverage +
## 0, data = HCI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.22031 -0.06302 0.03606 0.11625 0.33659
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## PercentageofImprovedSanitationCoverage 0.0072416 0.0001331 54.41 <2e-16
##
## PercentageofImprovedSanitationCoverage ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1306 on 154 degrees of freedom
## Multiple R-squared: 0.9506, Adjusted R-squared: 0.9502
## F-statistic: 2960 on 1 and 154 DF, p-value: < 2.2e-16
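#H0: β0 = 0 versus H1: β0 ≠ 0, alpha = 0.05, residual d.f. = 153. From summary(graph) of the model with intercept, the t-statistic for the intercept is t0 = 14.56 (18.45 is the t-statistic for the slope), and the critical value is t(0.025,153) ≈ 1.976. Since |t0| = 14.56 > 1.976 (p-value < 2e-16), the decision is to reject H0: the intercept is significantly different from zero.
#This shows that the model without intercept is not better than the model with intercept; its residual standard error is also larger (0.1306 versus 0.08481), so its predictions are less precise.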
#Q2.11 Construct a 99% confidence interval for the mean response at x0 = 0.48.
summary(graph)
##
## Call:
## lm(formula = HumanCapitalIndex ~ PercentageofImprovedSanitationCoverage,
## data = HCI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.218965 -0.060953 -0.005689 0.073137 0.200999
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.260360 0.017884 14.56 <2e-16 ***
## PercentageofImprovedSanitationCoverage 0.004186 0.000227 18.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08481 on 153 degrees of freedom
## Multiple R-squared: 0.6898, Adjusted R-squared: 0.6878
## F-statistic: 340.2 on 1 and 153 DF, p-value: < 2.2e-16
# The interval is yhat0 -/+ t(alpha/2, n-2) * se(yhat0), where
#   yhat0     = intercept + slope * x0
#   se(yhat0) = sqrt(MSE * (1/n + (x0 - xbar)^2 / Sxx)).
# predict() evaluates yhat0 and se(yhat0) from the fitted model, so the
# (x0 - xbar)^2 term is included and the positive slope is used, and qt()
# gives the critical value for the model's own residual degrees of freedom
# instead of a table value.
# NOTE(review): x0 = 0.48 is far below the smallest observed coverage (7.079
# in the summary of HCI), so this is an extrapolation -- confirm that x0 is
# meant to be on the 0-100 percentage scale.
x0 <- 0.48
conf_level <- 0.99
new_point <- data.frame(PercentageofImprovedSanitationCoverage = x0)
mean_response <- predict(graph, newdata = new_point, se.fit = TRUE)
y0_hat <- unname(mean_response$fit)
t_crit <- qt(1 - (1 - conf_level) / 2, df = mean_response$df)
# AA is the half-width of the confidence interval.
AA <- t_crit * unname(mean_response$se.fit)
#The 99% confidence interval for the mean response at x0=0.48 is
cat(y0_hat - AA, "=< E(y|x0) =<", y0_hat + AA)
# Using the rounded figures in the summary above, the interval is roughly
# 0.216 =< E(y|x0) =< 0.309; rerun the script for the exact digits.