# table 1
favstats(~Wheelbase, data = Cars93, groups = Origin)
## Origin min Q1 median Q3 max mean sd n missing
## 1 USA 90 101 105 111 119 105.7292 6.790919 48 0
## 2 non-USA 90 97 103 106 115 102.0444 6.388753 45 0
usa<-Cars93[Cars93$Origin == "USA",]
not_usa<-Cars93[Cars93$Origin != "USA",]
ggplot(data = Cars93, aes(x = Wheelbase, facet = Origin, fill = Origin))+
facet_grid(rows = vars(Origin))+
geom_histogram(aes(y = ..density..),bins = 12, color = "black")+
geom_density(aes(y = ..density..), fill = "transparent")+
theme_light()+
labs(title = "Wheelbase Count by Origin", x = "Wheelbase (in)")+
theme(plot.title = element_text(hjust = 0.5))
ggplot(data = Cars93, aes(x = Origin, y = Wheelbase, fill = Origin))+
geom_boxplot()+
geom_point()+
theme_light()+
labs(title = "Cars93: Wheelbase (USA vs. non-USA)", y = "Wheelbase (in)")+
theme(plot.title = element_text(hjust = 0.5))
cars1 <- Cars93 %>%
select(Price, Horsepower, MPG.city)%>%
mutate(log_price = log(Price))%>%
data.table()
cor_ph<-cor(cars1$Price~cars1$Horsepower)
cor_pm<-cor(cars1$Price~cars1$MPG.city)
cor_logph<-cor(cars1$log_price~cars1$Horsepower)
cor_logpm<-cor(cars1$log_price~cars1$MPG.city)
cbind(cor_ph,cor_logph,cor_pm,cor_logpm)
## cor_ph cor_logph cor_pm cor_logpm
## [1,] 0.7882176 0.824668 -0.5945622 -0.7114318
attach(cars1)
horse_reg= lm(log_price~Horsepower)
summary(horse_reg)
##
## Call:
## lm(formula = log_price ~ Horsepower)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.73316 -0.16831 -0.01999 0.14305 0.73621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.8357418 0.0787413 23.31 <2e-16 ***
## Horsepower 0.0071593 0.0005147 13.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2586 on 91 degrees of freedom
## Multiple R-squared: 0.6801, Adjusted R-squared: 0.6766
## F-statistic: 193.4 on 1 and 91 DF, p-value: < 2.2e-16
plot(log_price~Horsepower)
abline(horse_reg)
par(mfrow= c(2,2))
plot(horse_reg, main = "lm horse")
par(mfrow = c(1,1))
detach(cars1)
attach(cars1)
mpg_reg= lm(log_price~MPG.city)
summary(mpg_reg)
##
## Call:
## lm(formula = log_price ~ MPG.city)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.58391 -0.19678 -0.04151 0.19854 1.06634
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.15282 0.13741 30.223 < 2e-16 ***
## MPG.city -0.05756 0.00596 -9.657 1.33e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3213 on 91 degrees of freedom
## Multiple R-squared: 0.5061, Adjusted R-squared: 0.5007
## F-statistic: 93.26 on 1 and 91 DF, p-value: 1.33e-15
plot(log_price~MPG.city)
abline(mpg_reg)
par(mfrow= c(2,2))
plot(mpg_reg, main = "lm MPG")
par(mfrow = c(1,1))
detach(cars1)
polynomial <- Vectorize(function(x, coefs) {
n <- length(coefs)
sum(coefs*x^(1:n-1))
}, "x")
attach(cars1)
n = length(Horsepower)
quad.horse_reg = lm (log_price ~ Horsepower + I(Horsepower^2))
summary(quad.horse_reg)
##
## Call:
## lm(formula = log_price ~ Horsepower + I(Horsepower^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.47543 -0.15301 -0.02586 0.13255 0.71142
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.168e+00 1.716e-01 6.808 1.07e-09 ***
## Horsepower 1.614e-02 2.149e-03 7.513 4.07e-11 ***
## I(Horsepower^2) -2.670e-05 6.232e-06 -4.286 4.56e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.237 on 90 degrees of freedom
## Multiple R-squared: 0.7343, Adjusted R-squared: 0.7284
## F-statistic: 124.4 on 2 and 90 DF, p-value: < 2.2e-16
plot(log_price~Horsepower)
curve(polynomial(x, coef(quad.horse_reg)), add =T)
par(mfrow= c(2,2))
plot(quad.horse_reg, main = "quad horse")
par(mfrow = c(1,1))
detach(cars1)
attach(cars1)
n = length(MPG.city)
quad.MPG_reg = lm (log_price ~ MPG.city + I(MPG.city^2))
summary(quad.MPG_reg)
##
## Call:
## lm(formula = log_price ~ MPG.city + I(MPG.city^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.70247 -0.21576 -0.03201 0.19133 1.03824
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.7929035 0.4093698 14.151 < 2e-16 ***
## MPG.city -0.1883215 0.0315262 -5.973 4.56e-08 ***
## I(MPG.city^2) 0.0024169 0.0005738 4.212 6.00e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2953 on 90 degrees of freedom
## Multiple R-squared: 0.5874, Adjusted R-squared: 0.5783
## F-statistic: 64.08 on 2 and 90 DF, p-value: < 2.2e-16
plot(log_price~MPG.city)
curve(polynomial(x, coef(quad.MPG_reg)), add =T)
par(mfrow= c(2,2))
plot(quad.MPG_reg, main = "quad MPG")
par(mfrow = c(1,1))
detach(cars1)
The quadratic models appear to fit better for both Horsepower and MPG.city for log(price)
horse150<-predict(quad.horse_reg, data.frame(Horsepower = 150))
exp(horse150)
## 1
## 19.86836
city20 <-predict(quad.MPG_reg, data.frame(MPG.city = 20))
exp(city20)
## 1
## 19.94976
# USA vs. not USA Origin sets
usa<-Cars93[Cars93$Origin == "USA",]
not_usa<-Cars93[Cars93$Origin != "USA",]
favstats(~Passengers, data = Cars93, groups = Origin)
## Origin min Q1 median Q3 max mean sd n missing
## 1 USA 2 5 5 6 8 5.333333 1.0784807 48 0
## 2 non-USA 2 4 5 5 7 4.822222 0.9363587 45 0
plot(Cars93$Passengers ~ Cars93$Origin)
t.test(Cars93$Passengers ~ Cars93$Origin, var.equal = F)
##
## Welch Two Sample t-test
##
## data: Cars93$Passengers by Cars93$Origin
## t = 2.4445, df = 90.482, p-value = 0.01644
## alternative hypothesis: true difference in means between group USA and group non-USA is not equal to 0
## 95 percent confidence interval:
## 0.09576277 0.92645945
## sample estimates:
## mean in group USA mean in group non-USA
## 5.333333 4.822222
wilcox.test(Cars93$Passengers~Cars93$Origin,
conf.int = T, conf.level = 0.95)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Cars93$Passengers by Cars93$Origin
## W = 1418, p-value = 0.005993
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 5.995107e-05 9.999604e-01
## sample estimates:
## difference in location
## 0.9999893
attach(usa)
shapiro.test(Passengers)
##
## Shapiro-Wilk normality test
##
## data: Passengers
## W = 0.91186, p-value = 0.001553
qqnorm(Passengers);qqline(Passengers)
hist(Passengers, freq = F, prob = T, ylim = c(0,0.7))
lines(density(Passengers))
curve(dnorm(x, mean(Passengers), sd(Passengers)), lty = 2, add = T)
detach(usa)
attach(not_usa)
shapiro.test(Passengers)
##
## Shapiro-Wilk normality test
##
## data: Passengers
## W = 0.79106, p-value = 1.521e-06
qqnorm(Passengers);qqline(Passengers)
hist(Passengers, freq = F, prob = T, ylim = c(0,0.7))
lines(density(Passengers))
curve(dnorm(x, mean(Passengers), sd(Passengers)), lty =2, add = T)
detach(not_usa)
# X ~ Bin(length(Cars93$Type), 1/6)
p = 1/6
obs<-counts(Cars93$Type)
type_test <- chisq.test(obs)
attach(type_test)
cbind(observed, expected, residuals)
## observed expected residuals
## n_Compact 16 15.5 0.1270001
## n_Large 11 15.5 -1.1430011
## n_Midsize 22 15.5 1.6510017
## n_Small 21 15.5 1.3970014
## n_Sporty 14 15.5 -0.3810004
## n_Van 9 15.5 -1.6510017
type_test
##
## Chi-squared test for given probabilities
##
## data: obs
## X-squared = 8.871, df = 5, p-value = 0.1143
detach(type_test)
# function for replication n = 10 M = 1000
rng.bin <- function(d, n) {
M = 1000
Mean <- replicate(M, {
x <- d(n)
mean(x)})
Median <- replicate(M,{
x2 <- d(n)
median(x2)})
Midrange <- replicate(M, {
x <- d(n)
(min(x)+max(x))/2})
Midquartile <- replicate(M, {
x <- d(n)
as.numeric((quantile(x, 0.25)+quantile(x,0.75))/2)})
estimator_df <- as.data.frame(cbind(Mean, Median, Midrange, Midquartile))
return(estimator_df)
}
# Create Boxplot, density grid func, and Pairs plots
dens <- function(df, lab) {
par(mfrow = c(1,1))
boxplot(df, main = paste("Boxplot: ", lab))
par(mfrow = c(2,2))
attach(df)
plot(density(Mean), main = paste("Mean: ",lab))
plot(density(Median), main = paste("Median: ",lab))
plot(density(Midrange), main = paste("Midrange: ",lab))
plot(density(Midquartile), main = paste("Midquartile: ",lab))
plot(df, main = paste(lab))
detach(df)
par(mfrow = c(1,1))
}
# Func for Means and SDs
mu_sd <- function(df, x) {
return <- df %>%
summarise(across(.cols = everything(), list(mean = mean, sd = sd), .names = "{.col}_{.fn}"))
col_data<-data.frame(t((return)))
colnames(col_data)[1] <- x
return(col_data)
}
n10 <- rng.bin(rnorm, 10 )
dens(n10, "Norm, n = 10")
mu_sd(n10, "Norm, n = 10")
## Norm, n = 10
## Mean_mean 0.0093886807
## Mean_sd 0.3200139305
## Median_mean 0.0008039189
## Median_sd 0.3704201208
## Midrange_mean 0.0317664200
## Midrange_sd 0.4261258694
## Midquartile_mean -0.0092606765
## Midquartile_sd 0.3332977366
n30 <- rng.bin(rnorm, 30 )
dens(n30, "Norm, n = 30")
mu_sd(n30, "Norm, n = 30")
## Norm, n = 30
## Mean_mean 0.009681523
## Mean_sd 0.184780417
## Median_mean -0.015507847
## Median_sd 0.222052655
## Midrange_mean -0.002336574
## Midrange_sd 0.349061620
## Midquartile_mean 0.005120200
## Midquartile_sd 0.200162142
n100 <- rng.bin(rnorm, 100)
dens(n100, "Norm, n =100")
mu_sd(n100, "Norm, n =100")
## Norm, n =100
## Mean_mean -0.007929331
## Mean_sd 0.099420319
## Median_mean 0.009547294
## Median_sd 0.125060519
## Midrange_mean -0.008327788
## Midrange_sd 0.304087756
## Midquartile_mean -0.004259058
## Midquartile_sd 0.108826054
u10 <- rng.bin(runif, 10 )
dens(u10, "Unif, n = 10")
mu_sd(u10, "Unif, n = 10")
## Unif, n = 10
## Mean_mean 0.50492587
## Mean_sd 0.09299802
## Median_mean 0.49778008
## Median_sd 0.13601627
## Midrange_mean 0.50080013
## Midrange_sd 0.06398980
## Midquartile_mean 0.50539323
## Midquartile_sd 0.10396846
u30 <- rng.bin(runif, 30 )
dens(u30, "Unif, n = 30")
mu_sd(u30, "Unif, n = 30")
## Unif, n = 30
## Mean_mean 0.50094382
## Mean_sd 0.05273066
## Median_mean 0.49943649
## Median_sd 0.08756694
## Midrange_mean 0.50007279
## Midrange_sd 0.02262278
## Midquartile_mean 0.49859311
## Midquartile_sd 0.06180163
u100 <- rng.bin(runif, 100)
dens(u100, "Unif, n = 100")
mu_sd(u100, "Unif, n = 100")
## Unif, n = 100
## Mean_mean 0.500972013
## Mean_sd 0.029176312
## Median_mean 0.502921383
## Median_sd 0.047463268
## Midrange_mean 0.499731075
## Midrange_sd 0.006828975
## Midquartile_mean 0.500010681
## Midquartile_sd 0.034598230
e10 <- rng.bin(rexp , 10 )
dens(e10, "Exp, n = 10")
mu_sd(e10, "Exp, n = 10")
## Exp, n = 10
## Mean_mean 0.9942881
## Mean_sd 0.3197017
## Median_mean 0.7422520
## Median_sd 0.3088651
## Midrange_mean 1.4986196
## Midrange_sd 0.6197144
## Midquartile_mean 0.8590849
## Midquartile_sd 0.3071255
e30 <- rng.bin(rexp , 30 )
dens(e30, "Exp, n = 30")
mu_sd(e30, "Exp, n = 30")
## Exp, n = 30
## Mean_mean 1.0005703
## Mean_sd 0.1771024
## Median_mean 0.7128236
## Median_sd 0.1791856
## Midrange_mean 2.0338167
## Midrange_sd 0.6743531
## Midquartile_mean 0.8401354
## Midquartile_sd 0.1776611
e100 <- rng.bin(rexp , 100)
dens(e100, "Exp, n = 100")
mu_sd(e100, "Exp, n = 100")
## Exp, n = 100
## Mean_mean 1.0004083
## Mean_sd 0.1020926
## Median_mean 0.7019587
## Median_sd 0.1023037
## Midrange_mean 2.5892140
## Midrange_sd 0.6569092
## Midquartile_mean 0.8372014
## Midquartile_sd 0.1009930
# write func for CV
cv <- function(items) {
cv_value = (sd(items)/mean(items))
return(cv_value)
}
# write bootstrap func for CV
cv.boot <- function(df) {
set.seed(23947)
boot_set <- replicate(1000, {
cv(sample(df, length(df), replace = T))
})
return(boot_set)
}
# run bootstrap CV func for engine size
cv.engine <- cv.boot(Cars93$EngineSize)
summary(cv.engine)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2943 0.3674 0.3849 0.3846 0.4016 0.4626
sd(cv.engine)
## [1] 0.02523797
#plot estimated sampling distribution
hist(cv.engine, freq = F)
lines(density(cv.engine))
abline(v=cv(Cars93$EngineSize), col = "red")
qqnorm(cv.engine);qqline(cv.engine)
# CV for original sample
cv(Cars93$EngineSize)
## [1] 0.3888543
# 90% conf. int (t- and percentile) for estimate of CV
n = length(Cars93$EngineSize)
(sd(Cars93$EngineSize)/mean(Cars93$EngineSize))+
c(-1,1)*qt(0.95, df=n-1)*sd(cv.engine)
## [1] 0.3469193 0.4307894
quantile(cv.engine, c(0.05,0.95))
## 5% 95%
## 0.3426848 0.4266505
# USA vs. not USA Origin sets
usa<-Cars93[Cars93$Origin == "USA",]
not_usa<-Cars93[Cars93$Origin != "USA",]
# CV for Engine size diff by origin for reference
diff.cv = cv(usa$EngineSize)-cv(not_usa$EngineSize)
diff.cv
## [1] 0.05038946
# create df for permutation test
df_engine<-Cars93 %>%
mutate(Origin_num = if_else (Origin == "USA", 1,2))%>%
dplyr::select(EngineSize, Origin_num)%>%
data.frame()
eng.grp = df_engine$Origin_num
n=length(df_engine$EngineSize)
B = 100000
attach(df_engine)
cv.diff = c()
set.seed(34532)
cv.diff <- replicate(B, {
res = sample(eng.grp, n, replace = F)
cv(EngineSize[res==1])-cv(EngineSize[res==2])
})
detach(df_engine)
summary(cv.diff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.2042446 -0.0358930 0.0003332 0.0001815 0.0363575 0.2321809
hist(cv.diff, freq = F)
lines(density(cv.diff))
abline(v=diff.cv, col = "red")
p.value = sum(cv.diff > diff.cv)/B
p.value
## [1] 0.17225