library(DAAG)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gtsummary)
head(roller)
##   weight depression
## 1    1.9          2
## 2    3.1          1
## 3    3.3          5
## 4    4.8          5
## 5    5.3         20
## 6    6.1         20
dim(roller)
## [1] 10  2
p <- function(x) sum(is.na(x)) / length(x) * 100  # percentage of missing values per column
apply(roller, 2, p)  # no missing values in either column
tail(roller)
##    weight depression
## 5     5.3         20
## 6     6.1         20
## 7     6.4         23
## 8     7.6         10
## 9     9.8         30
## 10   12.4         25
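
#descriptive statistics (optional; gtsummary's tbl_summary(roller) would give a formatted table as well)

summary(roller)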

#normality test

shapiro.test(roller$weight)
## 
##  Shapiro-Wilk normality test
## 
## data:  roller$weight
## W = 0.95063, p-value = 0.676
shapiro.test(roller$depression)#p > 0.05 for both variables, so normality is not rejected
## 
##  Shapiro-Wilk normality test
## 
## data:  roller$depression
## W = 0.8996, p-value = 0.2169
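
#with normality not rejected for either variable, a Pearson correlation test is a
#reasonable extra check (optional; Spearman would be the fallback otherwise)

cor.test(roller$weight, roller$depression)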

#scatter plot

ggplot(roller, aes(x = weight, y = depression)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_minimal() +
  ggtitle('Depression vs Weight', 'Source: roller data from DAAG')
## `geom_smooth()` using formula = 'y ~ x'

#linear regression

model<-lm(depression~weight,data=roller)
summary(model)
## 
## Call:
## lm(formula = depression ~ weight, data = roller)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.180 -5.580 -1.346  5.920  8.020 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -2.0871     4.7543  -0.439  0.67227   
## weight        2.6667     0.7002   3.808  0.00518 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.735 on 8 degrees of freedom
## Multiple R-squared:  0.6445, Adjusted R-squared:  0.6001 
## F-statistic:  14.5 on 1 and 8 DF,  p-value: 0.005175
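
#gtsummary is loaded above; tbl_regression() turns the fitted model into a
#formatted coefficient table (optional, output not shown)

tbl_regression(model)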

#model attributes

names(model)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

#model coefficients

coef(model)
## (Intercept)      weight 
##   -2.087148    2.666746
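
#confidence intervals for the coefficients (optional; confint() is base R)

confint(model)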

#residuals: residuals are the observed values minus the fitted values

fitted.values(model)#fitted values
##         1         2         3         4         5         6         7         8 
##  2.979669  6.179765  6.713114 10.713233 12.046606 14.180002 14.980026 18.180121 
##         9        10 
## 24.046962 30.980502
residuals(model)
##          1          2          3          4          5          6          7 
## -0.9796695 -5.1797646 -1.7131138 -5.7132327  7.9533944  5.8199976  8.0199738 
##          8          9         10 
## -8.1801213  5.9530377 -5.9805017
roller[1,]#observed values
##   weight depression
## 1    1.9          2
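
#quick sanity check of the definition above: residuals equal observed minus fitted

all.equal(unname(residuals(model)), unname(roller$depression - fitted(model)))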

#Diagnostics

par(mfrow=c(2,2))
plot(model)
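
#numeric diagnostics to complement the plots (optional): standardized residuals
#and Cook's distance help flag influential observations

rstandard(model)
cooks.distance(model)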

#Predictions

#confidence intervals for the mean depression when weight is 9, 12 and 14
predict(model, data.frame(weight = c(9, 12, 14)), interval = 'confidence')
##        fit      lwr      upr
## 1 21.91357 15.09380 28.73333
## 2 29.91380 19.15205 40.67556
## 3 35.24730 21.53255 48.96204

#Prediction intervals for individual new observations

predict(model,data.frame(weight=c(9,12,14)),interval = 'prediction')
##        fit       lwr      upr
## 1 21.91357  4.950258 38.87687
## 2 29.91380 11.017771 48.80984
## 3 35.24730 14.526804 55.96779
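
#the prediction intervals above are wider than the confidence intervals because
#they also cover the scatter of individual observations around the fitted line
#(optional width comparison)

conf_i <- predict(model, data.frame(weight = c(9, 12, 14)), interval = 'confidence')
pred_i <- predict(model, data.frame(weight = c(9, 12, 14)), interval = 'prediction')
pred_i[, 'upr'] - pred_i[, 'lwr']
conf_i[, 'upr'] - conf_i[, 'lwr']
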
pred_int <- predict(model, interval = 'prediction')  # new name, so the helper p defined above is not overwritten
## Warning in predict.lm(model, interval = "prediction"): predictions on current data refer to _future_ responses
data <- cbind(roller, pred_int)
ggplot(data, aes(weight, depression)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  theme_minimal() +
  ggtitle('Depression vs Weight', 'Source: roller data from DAAG') +
  geom_line(aes(y = lwr), color = 'blue', linetype = 'dashed', lwd = 2) +
  geom_line(aes(y = upr), color = 'blue', linetype = 'dashed', lwd = 2)
## `geom_smooth()` using formula = 'y ~ x'

#Quadratic

model1<-lm(depression~weight+I(weight^2),roller)
summary(model1)
## 
## Call:
## lm(formula = depression ~ weight + I(weight^2), data = roller)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.699  -3.192   1.244   4.792   6.163 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -12.1247     9.3821  -1.292   0.2373  
## weight        6.2337     2.9822   2.090   0.0749 .
## I(weight^2)  -0.2519     0.2051  -1.228   0.2590  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.531 on 7 degrees of freedom
## Multiple R-squared:  0.7075, Adjusted R-squared:  0.624 
## F-statistic: 8.467 on 2 and 7 DF,  p-value: 0.01353
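
#compare the linear and quadratic fits (optional); the F-test checks whether the
#squared term improves on the straight line

anova(model, model1)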

#Machine learning

#data partitioning

#note: 02 is parsed as 2, so the sampling weights are 0.8 and 2 rather than the
#intended 80/20 split, and no seed is set; that is why the training set below
#ends up so small (a single row in this run)
ind <- sample(2, nrow(roller), replace = TRUE, prob = c(0.8, 02))
train<-roller[ind==1,]
test<-roller[ind==2,]
train
##   weight depression
## 2    3.1          1

#refit the linear model on the training rows only

model<-lm(depression~weight,train)
summary(model)
## 
## Call:
## lm(formula = depression ~ weight, data = train)
## 
## Residuals:
## ALL 1 residuals are 0: no residual degrees of freedom!
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)        1        NaN     NaN      NaN
## weight            NA         NA      NA       NA
## 
## Residual standard error: NaN on 0 degrees of freedom

#prediction on the test set

pred<-predict(model,test)
## Warning in predict.lm(model, test): prediction from rank-deficient fit; attr(*,
## "non-estim") has doubtful cases
head(pred)
## 1 3 4 5 6 7 
## 1 1 1 1 1 1
head(test$weight)
## [1] 1.9 3.3 4.8 5.3 6.1 6.4
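
#a reproducible 80/20 split for comparison (a sketch: the seed and the exact-size
#split are illustrative choices, and with only 10 rows any split is fragile)

set.seed(123)
idx <- sample(nrow(roller), size = round(0.8 * nrow(roller)))  # 8 training rows
train2 <- roller[idx, ]
test2 <- roller[-idx, ]
model2 <- lm(depression ~ weight, data = train2)
pred2 <- predict(model2, test2)
sqrt(mean((test2$depression - pred2)^2))  # RMSE on the 2 held-out rows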