descriptive statistics & linear regression using r

Author

kirit ved

Published

November 9, 2023

lecture-3

author’s details

author’s image

kirit ved’s image

author’s web site

https://kiritved.com

loading r libraries & udf

r_start=function(){
  rm(list=ls(all.names=T))
  setwd("d:/met/met_lect4")
  set.seed(123)
  
  if(!require(pacman)) {
    install.packages("pacman")
  }
  
  library(pacman)
   
   
}
r_end=function(){
  
  rm(list=ls(all.names = T))
    
      }

descriptive statistics

#|label: "section-2"
r_start()
Loading required package: pacman
p_load(tidyverse,janitor,psych,DescTools,kableExtra)
p_loaded()
 [1] "kableExtra" "DescTools"  "psych"      "janitor"    "lubridate" 
 [6] "forcats"    "stringr"    "dplyr"      "purrr"      "readr"     
[11] "tidyr"      "tibble"     "ggplot2"    "tidyverse"  "pacman"    
d=iris|>janitor::clean_names()
head(d)
sepal_length sepal_width petal_length petal_width species
5.1 3.5 1.4 0.2 setosa
4.9 3.0 1.4 0.2 setosa
4.7 3.2 1.3 0.2 setosa
4.6 3.1 1.5 0.2 setosa
5.0 3.6 1.4 0.2 setosa
5.4 3.9 1.7 0.4 setosa
tail(d)
sepal_length sepal_width petal_length petal_width species
145 6.7 3.3 5.7 2.5 virginica
146 6.7 3.0 5.2 2.3 virginica
147 6.3 2.5 5.0 1.9 virginica
148 6.5 3.0 5.2 2.0 virginica
149 6.2 3.4 5.4 2.3 virginica
150 5.9 3.0 5.1 1.8 virginica
str(d)
'data.frame':   150 obs. of  5 variables:
 $ sepal_length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 $ sepal_width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 $ petal_length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 $ petal_width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 $ species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(d)
  sepal_length    sepal_width     petal_length    petal_width   
 Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
 1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
 Median :5.800   Median :3.000   Median :4.350   Median :1.300  
 Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
 3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
 Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
       species  
 setosa    :50  
 versicolor:50  
 virginica :50  
                
                
                
psych::describe(d)
vars n mean sd median trimmed mad min max range skew kurtosis se
sepal_length 1 150 5.843333 0.8280661 5.80 5.808333 1.03782 4.3 7.9 3.6 0.3086407 -0.6058125 0.0676113
sepal_width 2 150 3.057333 0.4358663 3.00 3.043333 0.44478 2.0 4.4 2.4 0.3126147 0.1387047 0.0355883
petal_length 3 150 3.758000 1.7652982 4.35 3.760000 1.85325 1.0 6.9 5.9 -0.2694109 -1.4168574 0.1441360
petal_width 4 150 1.199333 0.7622377 1.30 1.184167 1.03782 0.1 2.5 2.4 -0.1009166 -1.3581792 0.0622364
species* 5 150 2.000000 0.8192319 2.00 2.000000 1.48260 1.0 3.0 2.0 0.0000000 -1.5199333 0.0668900
DescTools::Desc(d,main="desctool library of r")
------------------------------------------------------------------------------ 
Describe d (data.frame):

data frame: 150 obs. of  5 variables
        150 complete cases (100.0%)

  Nr  ColName       Class    NAs  Levels                                  
  1   sepal_length  numeric  .                                            
  2   sepal_width   numeric  .                                            
  3   petal_length  numeric  .                                            
  4   petal_width   numeric  .                                            
  5   species       factor   .    (3): 1-setosa, 2-versicolor, 3-virginica


------------------------------------------------------------------------------ 
desctool library of r

  length       n    NAs  unique     0s   mean  meanCI'
     150     150      0      35      0  5.843   5.710
          100.0%   0.0%           0.0%          5.977
                                                     
     .05     .10    .25  median    .75    .90     .95
   4.600   4.800  5.100   5.800  6.400  6.900   7.255
                                                     
   range      sd  vcoef     mad    IQR   skew    kurt
   3.600   0.828  0.142   1.038  1.300  0.309  -0.606
                                                     
lowest : 4.3, 4.4 (3), 4.5, 4.6 (4), 4.7 (2)
highest: 7.3, 7.4, 7.6, 7.7 (4), 7.9

' 95%-CI (classic)

------------------------------------------------------------------------------ 
desctool library of r

  length       n    NAs  unique     0s   mean  meanCI'
     150     150      0      23      0  3.057   2.987
          100.0%   0.0%           0.0%          3.128
                                                     
     .05     .10    .25  median    .75    .90     .95
   2.345   2.500  2.800   3.000  3.300  3.610   3.800
                                                     
   range      sd  vcoef     mad    IQR   skew    kurt
   2.400   0.436  0.143   0.445  0.500  0.313   0.139
                                                     
lowest : 2.0, 2.2 (3), 2.3 (4), 2.4 (3), 2.5 (8)
highest: 3.9 (2), 4.0, 4.1, 4.2, 4.4

heap(?): remarkable frequency (17.3%) for the mode(s) (= 3)

' 95%-CI (classic)

------------------------------------------------------------------------------ 
desctool library of r

  length       n    NAs  unique    0s   mean  meanCI'
     150     150      0      43     0   3.76    3.47
          100.0%   0.0%          0.0%           4.04
                                                    
     .05     .10    .25  median   .75    .90     .95
    1.30    1.40   1.60    4.35  5.10   5.80    6.10
                                                    
   range      sd  vcoef     mad   IQR   skew    kurt
    5.90    1.77   0.47    1.85  3.50  -0.27   -1.42
                                                    
lowest : 1.0, 1.1, 1.2 (2), 1.3 (7), 1.4 (13)
highest: 6.3, 6.4, 6.6, 6.7 (2), 6.9

heap(?): remarkable frequency (8.7%) for the mode(s) (= 1.4, 1.5)

' 95%-CI (classic)

------------------------------------------------------------------------------ 
desctool library of r

  length       n    NAs  unique    0s   mean  meanCI'
     150     150      0      22     0   1.20    1.08
          100.0%   0.0%          0.0%           1.32
                                                    
     .05     .10    .25  median   .75    .90     .95
    0.20    0.20   0.30    1.30  1.80   2.20    2.30
                                                    
   range      sd  vcoef     mad   IQR   skew    kurt
    2.40    0.76   0.64    1.04  1.50  -0.10   -1.36
                                                    
lowest : 0.1 (5), 0.2 (29), 0.3 (7), 0.4 (7), 0.5
highest: 2.1 (6), 2.2 (3), 2.3 (8), 2.4 (3), 2.5 (3)

heap(?): remarkable frequency (19.3%) for the mode(s) (= 0.2)

' 95%-CI (classic)

------------------------------------------------------------------------------ 
desctool library of r

  length      n    NAs unique levels  dupes
     150    150      0      3      3      y
         100.0%   0.0%                     

        level  freq   perc  cumfreq  cumperc
1      setosa    50  33.3%       50    33.3%
2  versicolor    50  33.3%      100    66.7%
3   virginica    50  33.3%      150   100.0%

r_end()

linear regression

#|label: "section-3"
n=10
x=1:n
y=23.67*x+54.78+runif(n,-50,50)
d=data.frame(x,y)
d
x y
1 57.20775
2 130.95051
3 116.68769
4 187.76174
5 217.17673
6 151.35565
7 223.28055
8 283.38190
9 272.95350
10 287.14147

checking correlation

cor(d$x,d$y)
[1] 0.9259506

fitting values

mylm=lm(y~x,d);mylm;summary(mylm)

Call:
lm(formula = y ~ x, data = d)

Coefficients:
(Intercept)            x  
      61.54        23.86  

Call:
lm(formula = y ~ x, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-53.366 -15.590  -4.334  28.498  36.319 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   61.535     21.352   2.882  0.02045 *  
x             23.864      3.441   6.935  0.00012 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 31.26 on 8 degrees of freedom
Multiple R-squared:  0.8574,    Adjusted R-squared:  0.8396 
F-statistic: 48.09 on 1 and 8 DF,  p-value: 0.0001202
plot(x,y,col="lightgreen",lwd=5,pch=23,main="plot of y vs x",xlab="x",ylab="y",type="b")
lines(d$x,mylm$fitted.values)

x1=c(7.3,2.9,6.78)
dpred=data.frame(x=x1)
dpred
x
7.30
2.90
6.78
ypred=predict(mylm,dpred,add=T)
ypred
       1        2        3 
235.7457 130.7422 223.3362 
dpred$ypred=ypred
dpred
x ypred
7.30 235.7457
2.90 130.7422
6.78 223.3362

polynomial regression with r

mypollm=lm(y~poly(x,3),d)
mypollm

Call:
lm(formula = y ~ poly(x, 3), data = d)

Coefficients:
(Intercept)  poly(x, 3)1  poly(x, 3)2  poly(x, 3)3  
     192.79       216.76       -20.32        16.45  
summary(mypollm)

Call:
lm(formula = y ~ poly(x, 3), data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-58.309 -10.064  -1.222  22.042  35.378 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   192.79      10.90  17.683  2.1e-06 ***
poly(x, 3)1   216.76      34.48   6.287 0.000754 ***
poly(x, 3)2   -20.32      34.48  -0.589 0.577168    
poly(x, 3)3    16.45      34.48   0.477 0.650171    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 34.48 on 6 degrees of freedom
Multiple R-squared:  0.8699,    Adjusted R-squared:  0.8048 
F-statistic: 13.37 on 3 and 6 DF,  p-value: 0.00458
plot(x,y,col="lightgreen",lwd=5,pch=23,main="plot of y vs x",xlab="x",ylab="y",type="b")
lines(d$x,mylm$fitted.values,add=T,col="red")
Warning in plot.xy(xy.coords(x, y), type = type, ...): "add" is not a graphical
parameter
lines(d$x,mypollm$fitted.values,add=T,col="blue")
Warning in plot.xy(xy.coords(x, y), type = type, ...): "add" is not a graphical
parameter

x1=c(7.3,2.9,6.78)
dpred=data.frame(x=x1)
dpred
x
7.30
2.90
6.78
ypred=predict(mylm,dpred,add=T)
ypred
       1        2        3 
235.7457 130.7422 223.3362 
dpred$ypred=ypred
dpred
x ypred
7.30 235.7457
2.90 130.7422
6.78 223.3362

fitting values

multivariate regression using lm

creating data frame

n=25
x1=runif(n,-50,50)
x2=rnorm(n,50,20)
x3=runif(n,-100,100)
y=3*x1+4*x2+5*x3+56+runif(n,-100,100)
d=data.frame(x1,x2,x3,y)
d
x1 x2 x3 y
45.683335 48.88628 -13.021452 410.54782
-4.666584 34.31236 96.991396 624.45243
17.757064 35.32994 78.610223 555.78616
7.263340 45.68269 77.293812 736.53524
-39.707532 43.30174 -64.989470 -170.74371
39.982497 28.28602 -73.860862 -151.75389
-25.391227 48.29153 30.620385 335.95132
-45.794047 71.41221 -31.296706 138.60142
-17.207928 47.09213 31.351626 366.59953
45.450365 26.68910 -35.925351 100.38281
38.953932 33.62969 -62.461776 24.65035
19.280341 63.69872 56.458860 614.89433
14.050681 43.59887 -81.281003 -172.31348
49.426978 23.76955 -6.644192 210.09171
15.570580 38.00783 2.301092 240.14631
20.853047 47.41179 19.997792 505.03909
4.406602 67.73472 -33.435292 103.82270
9.414202 46.97208 -2.277393 178.95276
-21.084026 56.59582 90.894766 601.98643
-35.288635 -14.54646 -3.419521 -87.14791
46.302423 34.56416 78.070044 747.36545
40.229905 55.73097 82.887637 892.33061
19.070528 25.58976 21.746997 358.90543
29.546742 58.69101 -17.862045 337.50958
-47.538632 66.00354 -70.581062 -171.27991
cor(d)
            x1          x2         x3         y
x1  1.00000000 -0.18692405 0.07964991 0.3060026
x2 -0.18692405  1.00000000 0.01415521 0.2037604
x3  0.07964991  0.01415521 1.00000000 0.9236798
y   0.30600259  0.20376042 0.92367978 1.0000000
mylm=lm(y~.,d)
mylm

Call:
lm(formula = y ~ ., data = d)

Coefficients:
(Intercept)           x1           x2           x3  
     50.484        2.858        4.243        5.015  
summary(mylm)

Call:
lm(formula = y ~ ., data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-89.607 -44.720  -0.414  42.097  93.505 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  50.4841    34.6988   1.455     0.16    
x1            2.8583     0.4278   6.682 1.30e-06 ***
x2            4.2426     0.7281   5.827 8.77e-06 ***
x3            5.0154     0.2299  21.813 6.57e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 63.46 on 21 degrees of freedom
Multiple R-squared:  0.9647,    Adjusted R-squared:  0.9596 
F-statistic: 191.1 on 3 and 21 DF,  p-value: 2.121e-15

deciding input variable values to predict y

k=3
x1=runif(k,min(x1),max(x1))
x2=runif(k,min(x2),max(x2))
x3=runif(k,min(x3),max(x3))
dpred=data.frame(x1,x2,x3)
dpred
x1 x2 x3
16.44301 69.67773 -8.282921
32.14824 23.22649 -79.415006
28.70364 12.24705 -48.505707
ypred=predict(mylm,dpred)
ypred
         1          2          3 
 351.55498 -157.38728  -58.79084 
dpred$ypred=ypred
dpred
x1 x2 x3 ypred
16.44301 69.67773 -8.282921 351.55498
32.14824 23.22649 -79.415006 -157.38728
28.70364 12.24705 -48.505707 -58.79084

closing r environment

r_end()