#The work reported in this study explores a public dataset obtained from The Spark Foundation, analyzed using the R programming language. It is a simple linear regression task, since it involves just two variables: hours studied and the percentage score obtained. The data can be found at http://bit.ly/w-data
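#Since the dataset is hosted at the URL above, it could alternatively be read straight from the web instead of a local download. A minimal alternative (commented out here, and assuming the short link serves a plain CSV file):
#data <- read.csv("http://bit.ly/w-data")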
library(tidyverse)
library(readr)
data <- read.csv("C:/Users/HP PC/Downloads/Students Data.csv")
View(data)
head(data)
## Hours Scores
## 1 2.5 21
## 2 5.1 47
## 3 3.2 27
## 4 8.5 75
## 5 3.5 30
## 6 1.5 20
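#Before modelling, it is worth getting a quick feel for the data. A minimal sketch of some standard checks (output not shown):
dim(data)
summary(data)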
#Fit a simple linear regression of Scores on Hours
model <- lm(Scores ~ Hours, data = data)
model
##
## Call:
## lm(formula = Scores ~ Hours, data = data)
##
## Coefficients:
## (Intercept) Hours
## 2.484 9.776
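#From the coefficients above, the fitted line is approximately Scores = 2.484 + 9.776 * Hours. As a quick arithmetic check, plugging in the first observation (2.5 hours):
2.484 + 9.776 * 2.5
#gives about 26.92, which matches the first fitted value computed below.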
#Next, we visualize the relationship between hours studied and scores. We do this by loading the ggplot2 package.
library(ggplot2)
ggplot(data = data, aes(x = Hours, y = Scores)) +
  geom_point(color = 'red') +
  ggtitle('Distribution of Scores')
#From the graph above, we can clearly see a positive linear relationship between the number of hours studied and the percentage score. We can make this explicit by adding the fitted regression line.
ggplot(data = data, aes(x = Hours, y = Scores)) +
  geom_point(color = 'red') +
  ggtitle('Distribution of Scores') +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
#Obtain the model's fitted (predicted) scores for the observed study hours
predicted_scores <- predict(model, newdata = data)
predicted_scores
## 1 2 3 4 5 6 7 8
## 26.92318 52.34027 33.76624 85.57800 36.69899 17.14738 92.42106 56.25059
## 9 10 11 12 13 14 15 16
## 83.62284 28.87834 77.75736 60.16091 46.47479 34.74382 13.23706 89.48832
## 17 18 19 20 21 22 23 24
## 26.92318 21.05770 62.11607 74.82462 28.87834 49.40753 39.63173 69.93672
## 25
## 78.73494
data$prediction <- predicted_scores
View(data)
head(data)
## Hours Scores prediction
## 1 2.5 21 26.92318
## 2 5.1 47 52.34027
## 3 3.2 27 33.76624
## 4 8.5 75 85.57800
## 5 3.5 30 36.69899
## 6 1.5 20 17.14738
cor.test(data$Scores,predicted_scores)
##
## Pearson's product-moment correlation
##
## data: data$Scores and predicted_scores
## t = 21.583, df = 23, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9459248 0.9896072
## sample estimates:
## cor
## 0.9761907
#The observed and predicted scores have an estimated correlation of 0.976. Now let's visualize this.
ggplot(data = data, aes(x = predicted_scores, y = Scores)) +
  geom_point(color = 'darkblue') +
  ggtitle('Predicted vs Observed Scores')
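#As a side check: for a simple linear regression, the squared correlation between observed and fitted values equals the model's R-squared, so the value above squared (0.976^2, about 0.953) is also the R-squared of the full-data model:
cor(data$Scores, predicted_scores)^2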
#To validate the model, we split the data into training (80%) and test (20%) sets using the caret package.
library(caret)
set.seed(123)
training.samples <- data$Scores %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- data[training.samples, ]
test.data <- data[-training.samples, ]
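#As a quick sanity check on the split, we can count the observations in each set (a sketch; createDataPartition stratifies on the outcome, so the training fraction may not be exactly 80%):
nrow(train.data)
nrow(test.data)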
#Fit the model on the training data. We regress on Hours explicitly rather than using Scores ~ ., because the prediction column added earlier is a linear function of Hours and would make the design matrix rank-deficient.
model_train <- lm(Scores ~ Hours, data = train.data)
summary(model_train)
##
## Call:
## lm(formula = Scores ~ Hours, data = train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.885 -5.164 1.811 4.851 7.738
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.5005 2.8634 0.524 0.606
## Hours 9.9276 0.5125 19.372 1.98e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.839 on 20 degrees of freedom
## Multiple R-squared: 0.9494, Adjusted R-squared: 0.9469
## F-statistic: 375.3 on 1 and 20 DF, p-value: 1.983e-14
#In our example, the p-value of the F-statistic is 1.983e-14, which is highly significant. This means that the predictor, Hours, is significantly related to the outcome variable, Scores.
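#To complement the F-test, a 95% confidence interval for the Hours coefficient could also be reported (a sketch; the exact bounds depend on the random train/test split):
confint(model_train, "Hours", level = 0.95)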
predictions <- model_train %>% predict(test.data)
RMSE(predictions, test.data$Scores)
## [1] 3.869503
#The root mean squared error (RMSE) on the test set is about 3.87.
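#RMSE is simply the square root of the mean squared difference between predicted and observed scores, so the same value can be computed by hand:
sqrt(mean((predictions - test.data$Scores)^2))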
R2(predictions, test.data$Scores)
## [1] 0.9931043
#The R-squared value on the test set is 0.9931, which is very good.
#Lastly, we answer our stakeholder's question: what score is predicted if a student studies for 9.25 hours per day?
X <- data.frame(Hours = 9.25)
result <- predict(model, X)
print(result)
## 1
## 92.90985
#If a student studies for 9.25 hours per day, the model predicts a score of about 92.91.
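#As a sanity check, plugging 9.25 into the fitted equation reproduces this value up to rounding of the printed coefficients:
2.484 + 9.776 * 9.25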
#Next, we evaluate the model by calculating the mean absolute error (MAE) between the predicted and observed scores. We do this by first installing and loading the Metrics package.
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
mae(data$Scores, predicted_scores)
## [1] 4.972805
#MAE for this model is 4.973
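#For reference, MAE is just the average absolute difference between observed and fitted scores, so the same value can be obtained without the Metrics package:
mean(abs(data$Scores - predicted_scores))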