# Q2.1 Scatterplot of the dataset
# Read the data file (its first row supplies the column names), print a
# per-column summary, then draw the raw scatterplot.
HCI <- read.table(file = "HCI.txt", header = TRUE)
summary(HCI)
##    Counrty          HumanCapitalIndex PercentageofImprovedSanitationCoverage
##  Length:155         Min.   :0.2900    Min.   :  7.079                       
##  Class :character   1st Qu.:0.4250    1st Qu.: 44.386                       
##  Mode  :character   Median :0.5600    Median : 88.290                       
##                     Mean   :0.5654    Mean   : 72.854                       
##                     3rd Qu.:0.6900    3rd Qu.: 98.024                       
##                     Max.   :0.8800    Max.   :100.000
# NOTE(review): the first column prints as "Counrty" -- presumably a typo in
# the header of HCI.txt; confirm against the data file.
# Sanitation coverage goes on the x-axis, Human Capital Index on the y-axis.
plot(
  x = HCI[["PercentageofImprovedSanitationCoverage"]],
  y = HCI[["HumanCapitalIndex"]],
  xlab = "Percentage of Improved Sanitation Coverage",
  ylab = "Human Capital Index"
)

#Q2.2 Finding sample size n
# The sample size is taken as the number of rows of HCI.
n <- NROW(HCI)
cat("The sample size n is: ", n, "\n")
## The sample size n is:  155
#Q2.3 State the least-square criterion based on this real data set
# Least-squares criterion: choose (b0, b1) to minimise
#   S(b0, b1) = sum_{i=1}^{155} (y_i - b0 - b1 * x_i)^2,
# where x_i is PercentageofImprovedSanitationCoverage and y_i is
# HumanCapitalIndex. The minimisers are b1 = Sxy / Sxx and
# b0 = ybar - b1 * xbar, computed below.
x <- HCI$PercentageofImprovedSanitationCoverage
y <- HCI$HumanCapitalIndex
xbar <- mean(x)
ybar <- mean(y)
Sxx <- sum((x - xbar)^2)
# sum(y * (x - xbar)) equals sum((y - ybar) * (x - xbar)) because the
# deviations (x - xbar) sum to zero.
Sxy <- sum(y * (x - xbar))
betahat <- Sxy / Sxx               # slope estimate
betahat2 <- ybar - betahat * xbar  # intercept estimate
# Print the fitted line using the actual sign of the slope; a hard-coded "-"
# misreports a positive slope as negative.
slope_sign <- if (betahat < 0) "-" else "+"
cat("The least-square fit is: yˆ=", betahat2, slope_sign, abs(betahat), "x")
## The least-square fit is: yˆ= 0.26036 + 0.00418641 x
#Q2.4 Fit a SLR model with intercept based on least-squares estimators 
# Fit HumanCapitalIndex on PercentageofImprovedSanitationCoverage (with
# intercept) and print the coefficient table.
graph <- lm(HumanCapitalIndex ~ PercentageofImprovedSanitationCoverage, data = HCI)
summary(graph)
## 
## Call:
## lm(formula = HumanCapitalIndex ~ PercentageofImprovedSanitationCoverage, 
##     data = HCI)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.218965 -0.060953 -0.005689  0.073137  0.200999 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            0.260360   0.017884   14.56   <2e-16 ***
## PercentageofImprovedSanitationCoverage 0.004186   0.000227   18.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08481 on 153 degrees of freedom
## Multiple R-squared:  0.6898, Adjusted R-squared:  0.6878 
## F-statistic: 340.2 on 1 and 153 DF,  p-value: < 2.2e-16
# Draw the scatterplot first, then overlay the fitted line. abline() must be
# its own call after plot(); passed as an argument to plot() it would be
# matched to the `type` parameter instead of being an explicit drawing step.
plot(
  HCI$PercentageofImprovedSanitationCoverage,
  HCI$HumanCapitalIndex,
  xlab = "Percentage of Improved Sanitation Coverage",
  ylab = "Human Capital Index"
)
abline(graph)

#Q2.5 Coefficient of determination
# R-squared is read from the summary object of the fitted model `graph`.
cat("The coefficient of determination is ", summary(graph)[["r.squared"]])
## The coefficient of determination is  0.6897889
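#The coefficient of determination measures the proportion of the variability in the response that is explained by the regression model; in this context, R^2 = 0.6898 tells us that about 69% of the variation in the Human Capital Index is explained by its linear relationship with the percentage of improved sanitation coverage.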


#Q2.6
#The least-squares estimate of the slope in the SLR model is 0.0042: for each one-percentage-point increase in improved sanitation coverage, the mean Human Capital Index is estimated to increase by about 0.0042.

#Q2.7 The anova table is given by
# Analysis-of-variance table for the fitted model `graph`.
anova(object = graph)
## Analysis of Variance Table
## 
## Response: HumanCapitalIndex
##                                         Df Sum Sq Mean Sq F value    Pr(>F)    
## PercentageofImprovedSanitationCoverage   1 2.4473 2.44727  340.21 < 2.2e-16 ***
## Residuals                              153 1.1006 0.00719                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Q2.8 Unbiased estimator for σ^2 -> residual mean square MSE = SSE/(n-2)
# (This is the "Residuals" Mean Sq in the anova table, not the sample
# variance of Human Capital Index.)
anova(graph)
## Analysis of Variance Table
## 
## Response: HumanCapitalIndex
##                                         Df Sum Sq Mean Sq F value    Pr(>F)    
## PercentageofImprovedSanitationCoverage   1 2.4473 2.44727  340.21 < 2.2e-16 ***
## Residuals                              153 1.1006 0.00719                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Compute SSE / residual d.f. from the fitted model instead of retyping the
# rounded figures printed above (1.1006 / 153), so the value keeps full
# precision and follows the data if it changes.
# NOTE(review): the name `var` is also the name of the stats::var function;
# it is kept because the Q2.11 section reads `var`.
var <- sum(residuals(graph)^2) / df.residual(graph)
cat("Unbiased estimator for σ^2 is ", var)
# Printed value not re-generated here; it should be approximately 0.00719,
# in agreement with the Residuals Mean Sq in the table above.
#Q2.9 test significance of SLR model based on sample data by critical region method
#To test the significance of regression, F-test will be used
#H0:B1=0, H1:B1!=0 alpha=0.01
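#By Anova, F0=340.21; the critical value is F(0.01,1,153)≈6.80 (qf(0.99,1,153) in R; 6.64 is the table value for infinite denominator d.f.). The p-value for this test is < 2.2*10^(-16)
#Since F0=340.21 exceeds the critical value (F0 falls in the critical region), H0 is rejected, and we conclude that there is a significant linear relationship between the Human Capital Index and the percentage of improved sanitation coverage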

#Q2.10 test SLR model without intercept by t-test
# Regression through the origin: the `+0` term in the formula drops the
# intercept, so only a slope is estimated (see the coefficient table below).
modelwithoutintercept <- lm(
  formula = HumanCapitalIndex ~ PercentageofImprovedSanitationCoverage + 0,
  data = HCI
)
summary(modelwithoutintercept)
## 
## Call:
## lm(formula = HumanCapitalIndex ~ PercentageofImprovedSanitationCoverage + 
##     0, data = HCI)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.22031 -0.06302  0.03606  0.11625  0.33659 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## PercentageofImprovedSanitationCoverage 0.0072416  0.0001331   54.41   <2e-16
##                                           
## PercentageofImprovedSanitationCoverage ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1306 on 154 degrees of freedom
## Multiple R-squared:  0.9506, Adjusted R-squared:  0.9502 
## F-statistic:  2960 on 1 and 154 DF,  p-value: < 2.2e-16
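#H0: B0=0, H1: B0!=0, alpha=0.05, residual d.f.=153 (model with intercept). By t-test and the summary of the model with intercept, t0=14.56 for the intercept, critical value t(0.025,153)≈1.976; since |t0|>1.976 (p-value < 2e-16), the decision is to reject H0
#which shows the intercept is significantly different from zero, so the model without intercept is not better than the model with intercept; its residual standard error is also larger (0.1306 vs 0.08481), so the model prediction is less precise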

#Q2.11 Construct a 99% confidence interval for the mean response at x0 = 0.48.
summary(graph)
## 
## Call:
## lm(formula = HumanCapitalIndex ~ PercentageofImprovedSanitationCoverage, 
##     data = HCI)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.218965 -0.060953 -0.005689  0.073137  0.200999 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            0.260360   0.017884   14.56   <2e-16 ***
## PercentageofImprovedSanitationCoverage 0.004186   0.000227   18.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08481 on 153 degrees of freedom
## Multiple R-squared:  0.6898, Adjusted R-squared:  0.6878 
## F-statistic: 340.2 on 1 and 153 DF,  p-value: < 2.2e-16
# Interval: y0_hat +/- t(alpha/2, n-2) * sqrt(MSE * (1/n + (x0 - xbar)^2 / Sxx)).
# Every quantity is derived from the data and the fitted model `graph`, so
# nothing depends on figures retyped from printed output.
x0 <- 0.48
conf_level <- 0.99
# NOTE(review): x0 = 0.48 is below the smallest observed coverage (7.079 in
# the Q2.1 summary), so this is an extrapolation; if 48% was intended, set
# x0 <- 48 -- confirm against the question.
x_obs <- HCI$PercentageofImprovedSanitationCoverage
resid_df <- df.residual(graph)
mse <- sum(residuals(graph)^2) / resid_df
# Two-sided critical value t(0.005, 153).
t_crit <- qt(1 - (1 - conf_level) / 2, resid_df)
# Estimated mean response at x0: intercept + slope * x0 (the slope is positive).
y0_hat <- unname(coef(graph)[1] + coef(graph)[2] * x0)
# Half-width of the interval; the (x0 - xbar)^2 term uses the actual distance
# of x0 from the sample mean of x.
AA <- t_crit * sqrt(
  mse * (1 / length(x_obs) + (x0 - mean(x_obs))^2 / sum((x_obs - mean(x_obs))^2))
)
#The 99% confidence interval for the mean response at x0=0.48 is
cat(y0_hat - AA, "=< E(y|x0) =<", y0_hat + AA)
# Printed interval not re-generated here; from the estimates shown above it
# should be approximately 0.216 =< E(y|x0) =< 0.309.