library(DAAG)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gtsummary)
# Preview the first six rows of the roller data set.
head(roller)
## weight depression
## 1 1.9 2
## 2 3.1 1
## 3 3.3 5
## 4 4.8 5
## 5 5.3 20
## 6 6.1 20
# Dimensions of the data set (rows, columns).
dim(roller)
## [1] 10 2
# Percentage (0-100) of missing (NA) entries in its argument.
p <- function(roller) {
  n_missing <- sum(is.na(roller))
  n_total <- length(roller)
  n_missing / n_total * 100
}
# Percentage of missing values per column. vapply() iterates over the data
# frame's columns directly (apply() would first coerce the data frame to a
# matrix) and guarantees exactly one numeric result per column.
vapply(roller, p, numeric(1)) # no missing values
# Last six rows of the data set.
tail(roller)
## weight depression
## 5 5.3 20
## 6 6.1 20
## 7 6.4 23
## 8 7.6 10
## 9 9.8 30
## 10 12.4 25
# Normality test: Shapiro-Wilk on each variable
# (null hypothesis: the sample comes from a normal distribution).
shapiro.test(roller$weight)
##
## Shapiro-Wilk normality test
##
## data: roller$weight
## W = 0.95063, p-value = 0.676
shapiro.test(roller$depression)# p-value > 0.05 for both variables: no evidence against normality
##
## Shapiro-Wilk normality test
##
## data: roller$depression
## W = 0.8996, p-value = 0.2169
# Scatter plot of depression against weight with a fitted linear trend line.
ggplot(roller, aes(x = weight, y = depression)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_minimal() +
  ggtitle(
    label = 'Depression Vs Weight',
    subtitle = 'Source:Roller data from DAAG'
  )
## `geom_smooth()` using formula = 'y ~ x'
# Simple linear regression: depression modelled as a linear function of weight.
model <- lm(formula = depression ~ weight, data = roller)
summary(model)
##
## Call:
## lm(formula = depression ~ weight, data = roller)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.180 -5.580 -1.346 5.920 8.020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.0871 4.7543 -0.439 0.67227
## weight 2.6667 0.7002 3.808 0.00518 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.735 on 8 degrees of freedom
## Multiple R-squared: 0.6445, Adjusted R-squared: 0.6001
## F-statistic: 14.5 on 1 and 8 DF, p-value: 0.005175
# Model attributes: names of the components stored in the fitted lm object.
names(model)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
# Model coefficients (intercept and slope for weight).
coef(model)
## (Intercept) weight
## -2.087148 2.666746
# Fitted values. Residuals are the observed values minus these fitted values.
fitted.values(model)#fitted values
## 1 2 3 4 5 6 7 8
## 2.979669 6.179765 6.713114 10.713233 12.046606 14.180002 14.980026 18.180121
## 9 10
## 24.046962 30.980502
# Residuals: observed depression minus fitted depression, one per observation.
residuals(model)
## 1 2 3 4 5 6 7
## -0.9796695 -5.1797646 -1.7131138 -5.7132327 7.9533944 5.8199976 8.0199738
## 8 9 10
## -8.1801213 5.9530377 -5.9805017
roller[1,]# first observed row, for comparison with its fitted value and residual
## weight depression
## 1 1.9 2
# Diagnostics: draw the lm diagnostic plots in a 2 x 2 grid.
# par() returns the previous settings; they are restored afterwards so that
# later base-graphics plots are not forced into the 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)
# Predictions
# Expected depression at weights 9, 12 and 14, with confidence limits for the
# mean response. Note that weight = 14 lies beyond the largest observed weight
# (12.4), so that row is an extrapolation.
predict(
  model,
  newdata = data.frame(weight = c(9, 12, 14)),
  interval = 'confidence'
)
## fit lwr upr
## 1 21.91357 15.09380 28.73333
## 2 29.91380 19.15205 40.67556
## 3 35.24730 21.53255 48.96204
# Prediction intervals (limits for a single new observation, hence wider than
# the confidence limits for the mean) at the same three weights.
predict(
  model,
  newdata = data.frame(weight = c(9, 12, 14)),
  interval = 'prediction'
)
## fit lwr upr
## 1 21.91357 4.950258 38.87687
## 2 29.91380 11.017771 48.80984
## 3 35.24730 14.526804 55.96779
# Prediction intervals at the observed weights (no new data supplied).
# Note: this reuses the name p, replacing the missing-value helper function
# defined earlier in the script.
p<-predict(model,interval = 'prediction')
## Warning in predict.lm(model, interval = "prediction"): predictions on current data refer to _future_ responses
# Attach the fit/lwr/upr columns to the data, then redraw the scatter plot
# with the lower and upper prediction limits as dashed blue lines.
data <- cbind(roller, p)
ggplot(data, aes(weight, depression)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  theme_minimal() +
  ggtitle('Depression Vs Weight', 'Source :Roller data from DAAG') +
  geom_line(aes(y = lwr), color = 'blue', linetype = 'dashed', lwd = 2) +
  geom_line(aes(y = upr), color = 'blue', linetype = 'dashed', lwd = 2)
## `geom_smooth()` using formula = 'y ~ x'
# Quadratic model: adds a squared weight term to the linear fit.
model1 <- lm(formula = depression ~ weight + I(weight^2), data = roller)
summary(model1)
##
## Call:
## lm(formula = depression ~ weight + I(weight^2), data = roller)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.699 -3.192 1.244 4.792 6.163
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12.1247 9.3821 -1.292 0.2373
## weight 6.2337 2.9822 2.090 0.0749 .
## I(weight^2) -0.2519 0.2051 -1.228 0.2590
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.531 on 7 degrees of freedom
## Multiple R-squared: 0.7075, Adjusted R-squared: 0.624
## F-statistic: 8.467 on 2 and 7 DF, p-value: 0.01353
# Machine learning
# Data partitioning: assign each row at random to the training set (1) or the
# test set (2), with an intended 80/20 split.
# The weights were previously written as c(0.8, 02). R parses 02 as 2 and
# sample() rescales the weights, so only about 29% of rows went to training.
# The recorded output below (a single training row and a singular fit) comes
# from that erroneous split.
set.seed(123) # fixed seed so the partition is reproducible
ind <- sample(2, nrow(roller), replace = TRUE, prob = c(0.8, 0.2))
train <- roller[ind == 1, ]
test <- roller[ind == 2, ]
train
## weight depression
## 2 3.1 1
# Model fitted on the training data
# Refit the simple linear regression using the training rows only.
# This overwrites the full-data `model` fitted earlier in the script.
model <- lm(formula = depression ~ weight, data = train)
summary(model)
##
## Call:
## lm(formula = depression ~ weight, data = train)
##
## Residuals:
## ALL 1 residuals are 0: no residual degrees of freedom!
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1 NaN NaN NaN
## weight NA NA NA NA
##
## Residual standard error: NaN on 0 degrees of freedom
# Prediction on the test set
# Predict depression for the test rows using the model fitted on the training rows.
pred<-predict(model,test)
## Warning in predict.lm(model, test): prediction from rank-deficient fit; attr(*,
## "non-estim") has doubtful cases
# First few predicted depression values for the test rows.
head(pred)
## 1 3 4 5 6 7
## 1 1 1 1 1 1
# Weights of the corresponding test rows, for reference against the predictions.
head(test$weight)
## [1] 1.9 3.3 4.8 5.3 6.1 6.4