Environment Setting

Package Load

# Generate the practice dataset (run this once, then comment these lines out again)
#data <- mtcars
#write.csv(data, 'data/data.csv')

library(psych)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()

library(sm)

## Package 'sm', version 2.2-5.7: type help(sm) for summary information

Read data

# Load the practice dataset. The first CSV column holds the row names saved by
# write.csv(), so it comes back as an ordinary character column named `X`.
data <- read.csv(file = "data/data.csv")

Analysis

Descriptive Statistics

# Per-column overview: quartiles and mean for numeric columns, length/class
# for the character column.
data %>% summary()

##       X                  mpg             cyl             disp      
##  Length:32          Min.   :10.40   Min.   :4.000   Min.   : 71.1  
##  Class :character   1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8  
##  Mode  :character   Median :19.20   Median :6.000   Median :196.3  
##                     Mean   :20.09   Mean   :6.188   Mean   :230.7  
##                     3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0  
##                     Max.   :33.90   Max.   :8.000   Max.   :472.0  
##        hp             drat             wt             qsec      
##  Min.   : 52.0   Min.   :2.760   Min.   :1.513   Min.   :14.50  
##  1st Qu.: 96.5   1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89  
##  Median :123.0   Median :3.695   Median :3.325   Median :17.71  
##  Mean   :146.7   Mean   :3.597   Mean   :3.217   Mean   :17.85  
##  3rd Qu.:180.0   3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90  
##  Max.   :335.0   Max.   :4.930   Max.   :5.424   Max.   :22.90  
##        vs               am              gear            carb      
##  Min.   :0.0000   Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4375   Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :5.000   Max.   :8.000

# Extended descriptive statistics (n, mean, sd, median, skew, kurtosis, se, ...)
# from the psych package; non-numeric columns are flagged with `*`.
psych::describe(x = data)

##      vars  n   mean     sd median trimmed    mad   min    max  range  skew
## X*      1 32  16.50   9.38  16.50   16.50  11.86  1.00  32.00  31.00  0.00
## mpg     2 32  20.09   6.03  19.20   19.70   5.41 10.40  33.90  23.50  0.61
## cyl     3 32   6.19   1.79   6.00    6.23   2.97  4.00   8.00   4.00 -0.17
## disp    4 32 230.72 123.94 196.30  222.52 140.48 71.10 472.00 400.90  0.38
## hp      5 32 146.69  68.56 123.00  141.19  77.10 52.00 335.00 283.00  0.73
## drat    6 32   3.60   0.53   3.70    3.58   0.70  2.76   4.93   2.17  0.27
## wt      7 32   3.22   0.98   3.33    3.15   0.77  1.51   5.42   3.91  0.42
## qsec    8 32  17.85   1.79  17.71   17.83   1.42 14.50  22.90   8.40  0.37
## vs      9 32   0.44   0.50   0.00    0.42   0.00  0.00   1.00   1.00  0.24
## am     10 32   0.41   0.50   0.00    0.38   0.00  0.00   1.00   1.00  0.36
## gear   11 32   3.69   0.74   4.00    3.62   1.48  3.00   5.00   2.00  0.53
## carb   12 32   2.81   1.62   2.00    2.65   1.48  1.00   8.00   7.00  1.05
##      kurtosis    se
## X*      -1.31  1.66
## mpg     -0.37  1.07
## cyl     -1.76  0.32
## disp    -1.21 21.91
## hp      -0.14 12.12
## drat    -0.71  0.09
## wt      -0.02  0.17
## qsec     0.34  0.32
## vs      -2.00  0.09
## am      -1.92  0.09
## gear    -1.07  0.13
## carb     1.26  0.29

Manipulation

# Split the rows by the am indicator: data_a keeps am == 0, data_b keeps am == 1.
# dplyr::filter is spelled out because it masks stats::filter.
data_a <- data %>% dplyr::filter(am == 0)
data_b <- data %>% dplyr::filter(am == 1)

Visualization

Histogram

# Histogram of the qsec column (base graphics).
hist(data$qsec)

# Kernel density estimate of qsec, drawn with the base plot method.
plot(density(data$qsec))

# Overlay the qsec density estimates of the two am groups. With
# model = "equal", sm also runs a permutation-based test of equal densities and
# draws a reference band, so the p-value depends on the random number stream.
# Fix the seed (the value itself is arbitrary) to make the result reproducible.
set.seed(42)
sm.density.compare(data$qsec, data$am, model = "equal")

## Test of equal densities:  p-value =  0.46

#am.f <- factor(data$am, levels = c(0,1),
#               labels = c('automatic', 'manual'))
#colfill<-c(2:(2+length(levels(am.f))))
#legend(locator(1), levels(am.f), fill=colfill)

Boxplot

# Base-graphics boxplot of qsec, one box per value of am.
boxplot(qsec ~ am, data = data)

# ggplot2 boxplot of qsec by am with the raw observations overlaid.
# - am is mapped as a factor so the x axis is discrete (two categories) rather
#   than a continuous 0..1 scale that needs a manual `group` aesthetic.
# - Boxplot outlier markers are suppressed because the jitter layer already
#   draws every observation; otherwise outliers would appear twice.
# - Jitter is horizontal only (height = 0) so plotted qsec values stay exact.
data %>%
  ggplot(aes(x = factor(am), y = qsec)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(
    color = "black", size = 0.4, alpha = 0.9,
    width = 0.2, height = 0
  ) +
  labs(x = "am")

Modeling

independent 2-group t-test

# Independent two-group t-test of qsec by am. qsec is numeric; am is a numeric
# 0/1 indicator (not a factor) that the formula interface uses for grouping.
# var.equal is not set, so the Welch (unequal-variance) test is performed.
t.test(qsec ~ am, data = data)

## 
##  Welch Two Sample t-test
## 
## data:  qsec by am
## t = 1.2878, df = 25.534, p-value = 0.2093
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.4918522  2.1381679
## sample estimates:
## mean in group 0 mean in group 1 
##        18.18316        17.36000

Regression Modeling

# Simple linear regression of qsec on the 0/1 indicator am. With a single
# binary predictor, the intercept is the mean of the am == 0 group and the am
# coefficient is the difference between the two group means.
m1 <- lm(qsec ~ am, data = data)
summary(m1)

## 
## Call:
## lm(formula = qsec ~ am, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8600 -0.9583 -0.3516  1.2517  4.7168 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  18.1832     0.4056  44.833   <2e-16 ***
## am           -0.8232     0.6363  -1.294    0.206    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.768 on 30 degrees of freedom
## Multiple R-squared:  0.05284,    Adjusted R-squared:  0.02126 
## F-statistic: 1.674 on 1 and 30 DF,  p-value: 0.2057

Business Analytics Practice

Gary Pu

2022-10-14

Environment Setting

Package Load

Read data

Analysis

Descriptive Statistics

Manipulation

Visualization

Histogram

Boxplot

Modeling

independent 2-group t-test

Regression Modeling