=function(){
r_startrm(list=ls(all.names=T))
setwd("d:/met/met_lect4")
set.seed(123)
if(!require(pacman)) {
install.packages("pacman")
}
library(pacman)
}=function(){
r_end
rm(list=ls(all.names = T))
}
descriptive statistics & linear regression using r
lecture-3
loading r libraries & udf
descriptive statistics
#|label: "section-2"
r_start()
Loading required package: pacman
p_load(tidyverse,janitor,psych,DescTools,kableExtra)
p_loaded()
[1] "kableExtra" "DescTools" "psych" "janitor" "lubridate"
[6] "forcats" "stringr" "dplyr" "purrr" "readr"
[11] "tidyr" "tibble" "ggplot2" "tidyverse" "pacman"
=iris|>janitor::clean_names()
dhead(d)
sepal_length | sepal_width | petal_length | petal_width | species |
---|---|---|---|---|
5.1 | 3.5 | 1.4 | 0.2 | setosa |
4.9 | 3.0 | 1.4 | 0.2 | setosa |
4.7 | 3.2 | 1.3 | 0.2 | setosa |
4.6 | 3.1 | 1.5 | 0.2 | setosa |
5.0 | 3.6 | 1.4 | 0.2 | setosa |
5.4 | 3.9 | 1.7 | 0.4 | setosa |
tail(d)
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
145 | 6.7 | 3.3 | 5.7 | 2.5 | virginica |
146 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
147 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
148 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
149 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
150 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
str(d)
'data.frame': 150 obs. of 5 variables:
$ sepal_length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ sepal_width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ petal_length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ petal_width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(d)
sepal_length sepal_width petal_length petal_width
Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
Median :5.800 Median :3.000 Median :4.350 Median :1.300
Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
species
setosa :50
versicolor:50
virginica :50
::describe(d) psych
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
sepal_length | 1 | 150 | 5.843333 | 0.8280661 | 5.80 | 5.808333 | 1.03782 | 4.3 | 7.9 | 3.6 | 0.3086407 | -0.6058125 | 0.0676113 |
sepal_width | 2 | 150 | 3.057333 | 0.4358663 | 3.00 | 3.043333 | 0.44478 | 2.0 | 4.4 | 2.4 | 0.3126147 | 0.1387047 | 0.0355883 |
petal_length | 3 | 150 | 3.758000 | 1.7652982 | 4.35 | 3.760000 | 1.85325 | 1.0 | 6.9 | 5.9 | -0.2694109 | -1.4168574 | 0.1441360 |
petal_width | 4 | 150 | 1.199333 | 0.7622377 | 1.30 | 1.184167 | 1.03782 | 0.1 | 2.5 | 2.4 | -0.1009166 | -1.3581792 | 0.0622364 |
species* | 5 | 150 | 2.000000 | 0.8192319 | 2.00 | 2.000000 | 1.48260 | 1.0 | 3.0 | 2.0 | 0.0000000 | -1.5199333 | 0.0668900 |
::Desc(d,main="desctool library of r") DescTools
------------------------------------------------------------------------------
Describe d (data.frame):
data frame: 150 obs. of 5 variables
150 complete cases (100.0%)
Nr ColName Class NAs Levels
1 sepal_length numeric .
2 sepal_width numeric .
3 petal_length numeric .
4 petal_width numeric .
5 species factor . (3): 1-setosa, 2-versicolor, 3-virginica
------------------------------------------------------------------------------
desctool library of r
length n NAs unique 0s mean meanCI'
150 150 0 35 0 5.843 5.710
100.0% 0.0% 0.0% 5.977
.05 .10 .25 median .75 .90 .95
4.600 4.800 5.100 5.800 6.400 6.900 7.255
range sd vcoef mad IQR skew kurt
3.600 0.828 0.142 1.038 1.300 0.309 -0.606
lowest : 4.3, 4.4 (3), 4.5, 4.6 (4), 4.7 (2)
highest: 7.3, 7.4, 7.6, 7.7 (4), 7.9
' 95%-CI (classic)
------------------------------------------------------------------------------
desctool library of r
length n NAs unique 0s mean meanCI'
150 150 0 23 0 3.057 2.987
100.0% 0.0% 0.0% 3.128
.05 .10 .25 median .75 .90 .95
2.345 2.500 2.800 3.000 3.300 3.610 3.800
range sd vcoef mad IQR skew kurt
2.400 0.436 0.143 0.445 0.500 0.313 0.139
lowest : 2.0, 2.2 (3), 2.3 (4), 2.4 (3), 2.5 (8)
highest: 3.9 (2), 4.0, 4.1, 4.2, 4.4
heap(?): remarkable frequency (17.3%) for the mode(s) (= 3)
' 95%-CI (classic)
------------------------------------------------------------------------------
desctool library of r
length n NAs unique 0s mean meanCI'
150 150 0 43 0 3.76 3.47
100.0% 0.0% 0.0% 4.04
.05 .10 .25 median .75 .90 .95
1.30 1.40 1.60 4.35 5.10 5.80 6.10
range sd vcoef mad IQR skew kurt
5.90 1.77 0.47 1.85 3.50 -0.27 -1.42
lowest : 1.0, 1.1, 1.2 (2), 1.3 (7), 1.4 (13)
highest: 6.3, 6.4, 6.6, 6.7 (2), 6.9
heap(?): remarkable frequency (8.7%) for the mode(s) (= 1.4, 1.5)
' 95%-CI (classic)
------------------------------------------------------------------------------
desctool library of r
length n NAs unique 0s mean meanCI'
150 150 0 22 0 1.20 1.08
100.0% 0.0% 0.0% 1.32
.05 .10 .25 median .75 .90 .95
0.20 0.20 0.30 1.30 1.80 2.20 2.30
range sd vcoef mad IQR skew kurt
2.40 0.76 0.64 1.04 1.50 -0.10 -1.36
lowest : 0.1 (5), 0.2 (29), 0.3 (7), 0.4 (7), 0.5
highest: 2.1 (6), 2.2 (3), 2.3 (8), 2.4 (3), 2.5 (3)
heap(?): remarkable frequency (19.3%) for the mode(s) (= 0.2)
' 95%-CI (classic)
------------------------------------------------------------------------------
desctool library of r
length n NAs unique levels dupes
150 150 0 3 3 y
100.0% 0.0%
level freq perc cumfreq cumperc
1 setosa 50 33.3% 50 33.3%
2 versicolor 50 33.3% 100 66.7%
3 virginica 50 33.3% 150 100.0%
r_end()
linear regression
#|label: "section-3"
=10
n=1:n
x=23.67*x+54.78+runif(n,-50,50)
y=data.frame(x,y)
d d
x | y |
---|---|
1 | 57.20775 |
2 | 130.95051 |
3 | 116.68769 |
4 | 187.76174 |
5 | 217.17673 |
6 | 151.35565 |
7 | 223.28055 |
8 | 283.38190 |
9 | 272.95350 |
10 | 287.14147 |
checking correlation
cor(d$x,d$y)
[1] 0.9259506
fitting values
=lm(y~x,d);mylm;summary(mylm) mylm
Call:
lm(formula = y ~ x, data = d)
Coefficients:
(Intercept) x
61.54 23.86
Call:
lm(formula = y ~ x, data = d)
Residuals:
Min 1Q Median 3Q Max
-53.366 -15.590 -4.334 28.498 36.319
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 61.535 21.352 2.882 0.02045 *
x 23.864 3.441 6.935 0.00012 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 31.26 on 8 degrees of freedom
Multiple R-squared: 0.8574, Adjusted R-squared: 0.8396
F-statistic: 48.09 on 1 and 8 DF, p-value: 0.0001202
plot(x,y,col="lightgreen",lwd=5,pch=23,main="plot of y vs x",xlab="x",ylab="y",type="b")
lines(d$x,mylm$fitted.values)
=c(7.3,2.9,6.78)
x1=data.frame(x=x1)
dpred dpred
x |
---|
7.30 |
2.90 |
6.78 |
=predict(mylm,dpred,add=T)
ypred ypred
1 2 3
235.7457 130.7422 223.3362
$ypred=ypred
dpred dpred
x | ypred |
---|---|
7.30 | 235.7457 |
2.90 | 130.7422 |
6.78 | 223.3362 |
polynomial regression with r
=lm(y~poly(x,3),d)
mypollm mypollm
Call:
lm(formula = y ~ poly(x, 3), data = d)
Coefficients:
(Intercept) poly(x, 3)1 poly(x, 3)2 poly(x, 3)3
192.79 216.76 -20.32 16.45
summary(mypollm)
Call:
lm(formula = y ~ poly(x, 3), data = d)
Residuals:
Min 1Q Median 3Q Max
-58.309 -10.064 -1.222 22.042 35.378
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 192.79 10.90 17.683 2.1e-06 ***
poly(x, 3)1 216.76 34.48 6.287 0.000754 ***
poly(x, 3)2 -20.32 34.48 -0.589 0.577168
poly(x, 3)3 16.45 34.48 0.477 0.650171
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 34.48 on 6 degrees of freedom
Multiple R-squared: 0.8699, Adjusted R-squared: 0.8048
F-statistic: 13.37 on 3 and 6 DF, p-value: 0.00458
plot(x,y,col="lightgreen",lwd=5,pch=23,main="plot of y vs x",xlab="x",ylab="y",type="b")
lines(d$x,mylm$fitted.values,add=T,col="red")
Warning in plot.xy(xy.coords(x, y), type = type, ...): "add" is not a graphical
parameter
lines(d$x,mypollm$fitted.values,add=T,col="blue")
Warning in plot.xy(xy.coords(x, y), type = type, ...): "add" is not a graphical
parameter
=c(7.3,2.9,6.78)
x1=data.frame(x=x1)
dpred dpred
x |
---|
7.30 |
2.90 |
6.78 |
=predict(mylm,dpred,add=T)
ypred ypred
1 2 3
235.7457 130.7422 223.3362
$ypred=ypred
dpred dpred
x | ypred |
---|---|
7.30 | 235.7457 |
2.90 | 130.7422 |
6.78 | 223.3362 |
fitting values
multivariate regression using lm
creating data frame
=25
n=runif(n,-50,50)
x1=rnorm(n,50,20)
x2=runif(n,-100,100)
x3=3*x1+4*x2+5*x3+56+runif(n,-100,100)
y=data.frame(x1,x2,x3,y)
d d
x1 | x2 | x3 | y |
---|---|---|---|
45.683335 | 48.88628 | -13.021452 | 410.54782 |
-4.666584 | 34.31236 | 96.991396 | 624.45243 |
17.757064 | 35.32994 | 78.610223 | 555.78616 |
7.263340 | 45.68269 | 77.293812 | 736.53524 |
-39.707532 | 43.30174 | -64.989470 | -170.74371 |
39.982497 | 28.28602 | -73.860862 | -151.75389 |
-25.391227 | 48.29153 | 30.620385 | 335.95132 |
-45.794047 | 71.41221 | -31.296706 | 138.60142 |
-17.207928 | 47.09213 | 31.351626 | 366.59953 |
45.450365 | 26.68910 | -35.925351 | 100.38281 |
38.953932 | 33.62969 | -62.461776 | 24.65035 |
19.280341 | 63.69872 | 56.458860 | 614.89433 |
14.050681 | 43.59887 | -81.281003 | -172.31348 |
49.426978 | 23.76955 | -6.644192 | 210.09171 |
15.570580 | 38.00783 | 2.301092 | 240.14631 |
20.853047 | 47.41179 | 19.997792 | 505.03909 |
4.406602 | 67.73472 | -33.435292 | 103.82270 |
9.414202 | 46.97208 | -2.277393 | 178.95276 |
-21.084026 | 56.59582 | 90.894766 | 601.98643 |
-35.288635 | -14.54646 | -3.419521 | -87.14791 |
46.302423 | 34.56416 | 78.070044 | 747.36545 |
40.229905 | 55.73097 | 82.887637 | 892.33061 |
19.070528 | 25.58976 | 21.746997 | 358.90543 |
29.546742 | 58.69101 | -17.862045 | 337.50958 |
-47.538632 | 66.00354 | -70.581062 | -171.27991 |
cor(d)
x1 x2 x3 y
x1 1.00000000 -0.18692405 0.07964991 0.3060026
x2 -0.18692405 1.00000000 0.01415521 0.2037604
x3 0.07964991 0.01415521 1.00000000 0.9236798
y 0.30600259 0.20376042 0.92367978 1.0000000
=lm(y~.,d)
mylm mylm
Call:
lm(formula = y ~ ., data = d)
Coefficients:
(Intercept) x1 x2 x3
50.484 2.858 4.243 5.015
summary(mylm)
Call:
lm(formula = y ~ ., data = d)
Residuals:
Min 1Q Median 3Q Max
-89.607 -44.720 -0.414 42.097 93.505
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 50.4841 34.6988 1.455 0.16
x1 2.8583 0.4278 6.682 1.30e-06 ***
x2 4.2426 0.7281 5.827 8.77e-06 ***
x3 5.0154 0.2299 21.813 6.57e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 63.46 on 21 degrees of freedom
Multiple R-squared: 0.9647, Adjusted R-squared: 0.9596
F-statistic: 191.1 on 3 and 21 DF, p-value: 2.121e-15
deciding input variable values to predict y
=3
k=runif(k,min(x1),max(x1))
x1=runif(k,min(x2),max(x2))
x2=runif(k,min(x3),max(x3))
x3=data.frame(x1,x2,x3)
dpred dpred
x1 | x2 | x3 |
---|---|---|
16.44301 | 69.67773 | -8.282921 |
32.14824 | 23.22649 | -79.415006 |
28.70364 | 12.24705 | -48.505707 |
=predict(mylm,dpred)
ypred ypred
1 2 3
351.55498 -157.38728 -58.79084
$ypred=ypred
dpred dpred
x1 | x2 | x3 | ypred |
---|---|---|---|
16.44301 | 69.67773 | -8.282921 | 351.55498 |
32.14824 | 23.22649 | -79.415006 | -157.38728 |
28.70364 | 12.24705 | -48.505707 | -58.79084 |
closing r environment
r_end()