I am going to do machine learning using the linear regression algorithm.
I will do the following:
1.0 Explore then split the data into training and test data
2.0 Make the model
3.0 Do prediction
4.0 Check the RMSE (root mean squared error) on the test data
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ recipes 1.0.8
## ✔ dials 1.2.0 ✔ rsample 1.2.0
## ✔ dplyr 1.1.3 ✔ tibble 3.2.1
## ✔ ggplot2 3.4.4 ✔ tidyr 1.3.0
## ✔ infer 1.0.5 ✔ tune 1.1.2
## ✔ modeldata 1.2.0 ✔ workflows 1.1.3
## ✔ parsnip 1.1.1 ✔ workflowsets 1.0.1
## ✔ purrr 1.0.2 ✔ yardstick 1.2.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
# Fix the random seed so the later train/test split is reproducible.
set.seed(seed = 123)
# Load the built-in airquality data set and preview its first rows.
data(list = "airquality")
head(x = airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
# Scatter plot of Wind against Temp with a fitted least-squares line.
# `se = FALSE` hides the confidence ribbon; FALSE is spelled out because `F`
# is an ordinary variable that can be reassigned, whereas FALSE is reserved.
f <- airquality %>%
  ggplot(aes(x = Temp, y = Wind)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal()
f
## `geom_smooth()` using formula = 'y ~ x'
# The scatter plot suggests a linear relationship between Wind and Temp,
# so a simple linear regression is a reasonable model.

# Split the data into training and test sets (initial_split() defaults).
split <- initial_split(airquality)
train <- training(split)
test <- testing(split)

# Fit Temp as a linear function of Wind using the training rows only.
# NOTE(review): the fitted object is named `lm`, the same as stats::lm();
# later code refers to the model by this name.
lm <- lm(formula = Temp ~ Wind, data = train)
names(lm)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
# Estimated intercept and slope of the fitted model.
coef(lm)
## (Intercept) Wind
## 89.556989 -1.168743
# Direct inspection of the residuals and fitted values is left disabled here.
#lm$residuals
#lm$fitted.values
# Print the model summary: residual quantiles, coefficient table,
# residual standard error, R-squared and the overall F-test.
summary(lm)
##
## Call:
## lm(formula = Temp ~ Wind, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.207 -5.214 1.793 5.734 18.857
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 89.5570 2.3938 37.413 < 2e-16 ***
## Wind -1.1687 0.2257 -5.179 9.94e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.619 on 112 degrees of freedom
## Multiple R-squared: 0.1932, Adjusted R-squared: 0.186
## F-statistic: 26.82 on 1 and 112 DF, p-value: 9.936e-07
# 95% confidence intervals for the intercept and the Wind slope.
confint(object = lm, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 84.814053 94.2999252
## Wind -1.615879 -0.7216066
# Draw the diagnostic plots for the fitted model.
plot(lm)
# Author's reading of the diagnostic plots: the assumptions of linear
# regression are not violated.
# Coefficient table of the model as a tibble (one row per term).
tidy(lm)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 89.6 2.39 37.4 3.97e-65
## 2 Wind -1.17 0.226 -5.18 9.94e- 7
# One-row tibble of model-level statistics (R-squared, sigma, AIC, BIC, ...).
glance(lm)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.193 0.186 8.62 26.8 0.000000994 1 -406. 819. 827.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Per-observation results for the training data: fitted values, residuals,
# leverage (.hat), Cook's distance and standardized residuals.
au <- augment(x = lm)
au
## # A tibble: 114 × 8
## Temp Wind .fitted .resid .hat .sigma .cooksd .std.resid
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 68 10.9 76.8 -8.82 0.00934 8.62 0.00498 -1.03
## 2 73 11.5 76.1 -3.12 0.0103 8.65 0.000690 -0.363
## 3 86 8 80.2 5.79 0.0115 8.64 0.00265 0.676
## 4 92 9.2 78.8 13.2 0.00920 8.57 0.0110 1.54
## 5 68 11.5 76.1 -8.12 0.0103 8.62 0.00468 -0.947
## 6 75 14.3 72.8 2.16 0.0215 8.66 0.000704 0.253
## 7 86 7.4 80.9 5.09 0.0134 8.64 0.00239 0.595
## 8 83 7.4 80.9 2.09 0.0134 8.66 0.000404 0.244
## 9 81 10.3 77.5 3.48 0.00884 8.65 0.000734 0.406
## 10 81 9.2 78.8 2.20 0.00920 8.66 0.000304 0.256
## # ℹ 104 more rows
# Predict Temp for the test rows, with confidence bounds (lwr/upr) for the
# fitted mean response.
pred <- predict(lm, newdata = test, interval = "confidence")
head(pred)
## fit lwr upr
## 1 80.90829 78.93461 82.88198
## 2 80.20705 78.37757 82.03652
## 3 74.83083 72.84977 76.81189
## 4 72.84397 70.33811 75.34983
## 5 81.49266 79.38022 83.60511
## 6 68.05212 63.96400 72.14024
# Predict Temp for the test rows, with prediction bounds (lwr/upr) for
# individual new observations; these are wider than the confidence bounds.
prediction <- predict(lm, newdata = test, interval = "prediction")
head(prediction)
## fit lwr upr
## 1 80.90829 63.71649 98.10009
## 2 80.20705 63.03121 97.38289
## 3 74.83083 57.63818 92.02348
## 4 72.84397 55.58297 90.10496
## 5 81.49266 64.28438 98.70095
## 6 68.05212 50.49150 85.61274
# Root mean squared error of the model's Temp predictions on the test rows:
# square the prediction errors, average them, then take the square root.
RMSE <- sqrt(
  mean(
    (test$Temp - predict(lm, newdata = test))^2
  )
)
RMSE
## [1] 7.922798