I am going to do machine learning using the linear regression algorithm.

I will do the following:

1.0 Explore then split the data into training and test data

2.0 Make the model

3.0 Do prediction

4.0 Check the RMSE

load appropriate package

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

## ✔ broom        1.0.5     ✔ recipes      1.0.8
## ✔ dials        1.2.0     ✔ rsample      1.2.0
## ✔ dplyr        1.1.3     ✔ tibble       3.2.1
## ✔ ggplot2      3.4.4     ✔ tidyr        1.3.0
## ✔ infer        1.0.5     ✔ tune         1.1.2
## ✔ modeldata    1.2.0     ✔ workflows    1.1.3
## ✔ parsnip      1.1.1     ✔ workflowsets 1.0.1
## ✔ purrr        1.0.2     ✔ yardstick    1.2.0

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages

# Fix the RNG seed so the random train/test split further down is reproducible.
set.seed(123)

# Load the built-in air-quality data set and preview its first six rows.
data("airquality")
head(airquality, n = 6L)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

I am using the Wind and Temp variables

#scatter plot

# Scatter plot of the response (Temp) against the predictor (Wind) with a
# least-squares line. The axes follow the model fitted later (Temp ~ Wind), so
# the line drawn here is the same regression that is modelled.
# `se = FALSE` hides the confidence ribbon; FALSE is spelled out because `F`
# is an ordinary variable that can be reassigned.
f <- airquality %>%
  ggplot(aes(x = Wind, y = Temp)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal()
f

## `geom_smooth()` using formula = 'y ~ x'

# There is a roughly linear (negative) relationship between Wind and Temp, so linear regression is a reasonable choice.

split data into train and test

# Randomly hold out a quarter of the rows for testing (prop = 3/4 is the
# initial_split() default, written out here for clarity). The split is
# reproducible because set.seed(123) is called earlier in the script.
split <- initial_split(airquality, prop = 3 / 4)
train <- training(split)
test <- testing(split)

Forming the model

# Simple linear regression of Temp on Wind, fitted on the training rows only.
# NOTE(review): the fitted object is stored under the name `lm`, which hides
# the stats::lm() function name from readers. Calls such as lm(...) still work
# because R looks up a function when a name is used in call position. The name
# is kept because the rest of the script refers to `lm`.
lm <- lm(formula = Temp ~ Wind, data = train)

explore the model

# List the components stored in the fitted model object.
names(lm)

##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

further explore

# Estimated intercept and Wind slope, via the standard accessor.
coef(lm)

## (Intercept)        Wind 
##   89.556989   -1.168743

#lm$residuals
#lm$fitted.values

summary of the model

# Residual summary, coefficient table with t-tests, R-squared and overall F-test.
summary(lm)

## 
## Call:
## lm(formula = Temp ~ Wind, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.207  -5.214   1.793   5.734  18.857 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  89.5570     2.3938  37.413  < 2e-16 ***
## Wind         -1.1687     0.2257  -5.179 9.94e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.619 on 112 degrees of freedom
## Multiple R-squared:  0.1932, Adjusted R-squared:  0.186 
## F-statistic: 26.82 on 1 and 112 DF,  p-value: 9.936e-07

confidence intervals of the model

# 95% confidence intervals for the intercept and the Wind slope.
confint(lm, level = 0.95)

##                 2.5 %     97.5 %
## (Intercept) 84.814053 94.2999252
## Wind        -1.615879 -0.7216066

check assumptions of linear regression

# Draw the standard lm diagnostic plots (residuals vs fitted, normal Q-Q,
# scale-location, residuals vs leverage) used to judge the model assumptions.
plot(lm)

# assumptions of linear regression not violated

forming a data frame

# Coefficient-level results as a tibble, one row per model term.
tidy(lm)

## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    89.6      2.39      37.4  3.97e-65
## 2 Wind           -1.17     0.226     -5.18 9.94e- 7

# Model-level fit statistics (R-squared, sigma, AIC, BIC, ...) as a one-row tibble.
glance(lm)

## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic     p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>       <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.193         0.186  8.62      26.8 0.000000994     1  -406.  819.  827.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

forming the augmented data frame

# One row per training observation, with fitted values, residuals and
# influence measures (.hat, .cooksd, ...) appended to the model data.
au <- augment(lm)
au

## # A tibble: 114 × 8
##     Temp  Wind .fitted .resid    .hat .sigma  .cooksd .std.resid
##    <int> <dbl>   <dbl>  <dbl>   <dbl>  <dbl>    <dbl>      <dbl>
##  1    68  10.9    76.8  -8.82 0.00934   8.62 0.00498      -1.03 
##  2    73  11.5    76.1  -3.12 0.0103    8.65 0.000690     -0.363
##  3    86   8      80.2   5.79 0.0115    8.64 0.00265       0.676
##  4    92   9.2    78.8  13.2  0.00920   8.57 0.0110        1.54 
##  5    68  11.5    76.1  -8.12 0.0103    8.62 0.00468      -0.947
##  6    75  14.3    72.8   2.16 0.0215    8.66 0.000704      0.253
##  7    86   7.4    80.9   5.09 0.0134    8.64 0.00239       0.595
##  8    83   7.4    80.9   2.09 0.0134    8.66 0.000404      0.244
##  9    81  10.3    77.5   3.48 0.00884   8.65 0.000734      0.406
## 10    81   9.2    78.8   2.20 0.00920   8.66 0.000304      0.256
## # ℹ 104 more rows

model accuracy

# Predicted mean Temp for each test row, with confidence limits for the
# mean response (lwr/upr columns).
pred <- predict(lm, newdata = test, interval = "confidence")
head(pred)

##        fit      lwr      upr
## 1 80.90829 78.93461 82.88198
## 2 80.20705 78.37757 82.03652
## 3 74.83083 72.84977 76.81189
## 4 72.84397 70.33811 75.34983
## 5 81.49266 79.38022 83.60511
## 6 68.05212 63.96400 72.14024

# Same point predictions, but with prediction limits for an individual new
# observation; these are wider than the confidence limits for the mean.
prediction <- predict(lm, newdata = test, interval = "prediction")
head(prediction)

##        fit      lwr      upr
## 1 80.90829 63.71649 98.10009
## 2 80.20705 63.03121 97.38289
## 3 74.83083 57.63818 92.02348
## 4 72.84397 55.58297 90.10496
## 5 81.49266 64.28438 98.70095
## 6 68.05212 50.49150 85.61274

RMSE—ACCURACY

# Root mean squared error on the held-out test rows: the typical size of the
# prediction error, in the same units as Temp.
test_residuals <- test$Temp - predict(lm, newdata = test)
RMSE <- sqrt(mean(test_residuals^2))
RMSE

## [1] 7.922798

MACHINE_LEARNING_linear_regression

mugo

23 April 2024