library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'dplyr' was built under R version 4.3.2
## Warning: package 'stringr' was built under R version 4.3.2
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
## here() starts at D:/MS Data Analytics/Classes Completed/STA 6443 Algorithms/Homeworks/Homework 1
library(ggplot2)
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.2
here::here()
## [1] "D:/MS Data Analytics/Classes Completed/STA 6443 Algorithms/Homeworks/Homework 1"
cars=read.csv("Cars.csv") # read dataset
.bordered{
border-style: solid;
border-color: teal;
padding: 5px;
background-color: #DCDCDC;
}
(A) Create a combined mpg variable called MPG_Combo which combines 60% of the MPG_City and 40% of the MPG_Highway. Obtain a box plot for MPG_Combo and comment on what the plot tells us about fuel inefficiencies.
MPG_Combo <- 0.6*cars$MPG_City+0.4*cars$MPG_Highway # combined mpg varialbe
cars=data.frame(cars, MPG_Combo) # data frame with MPG_Combo
AV <- ggplot(data = cars)+
geom_boxplot(mapping = aes("", MPG_Combo)) +
xlab("MPG Combo") +
ylab("MPG")
AV
AH <- ggplot(data = cars)+
geom_boxplot(mapping = aes(MPG_Combo, "")) +
xlab("MPG") +
ylab("MPG Combo")
AH
(B) Obtain box plots for MPG_Combo by Type and comment on any differences you notice between the different vehicle types combined fuel efficiency.
B <- ggplot(cars, aes(x = Type, y = MPG_Combo, fill = Type))
B <- B + geom_boxplot()
B <- B + labs(title = " MPG Combo by Vehicle Type")
B
#
#
#
# boxplot(cars$MPG_Combo~cars$Type,
# main = "MPG Combo by Type",
# xlab = "Type",
# ylab = "MPG Combo",)
#
# boxplot(cars$MPG_Combo~cars$Type,
# xlab = "Cars",ylab = "MPG",
# main="MPG_Combo by Type",
# border=(c("black","black","black","black","black")),
# horizontal = FALSE,col=(c("red","blue","green","yellow","orange")))
(C) Obtain basic descriptive statistics for Horsepower for all vehicles. Comment on any general features and statistics of the data. Use visual and quantitative methods to comment on whether an assumption of Normality would be reasonable for Horsepower variable.
qqnorm(cars$Horsepower); qqline(cars$Horsepower, col=2)
CV <- ggplot(data = cars)+
geom_boxplot(mapping = aes("", Horsepower)) +
xlab("Combined Horsepower") +
ylab("Horsepower")
CV
hist(cars$Horsepower,main="Horsepower", xlab="Horsepower");
skewness(cars$Horsepower, na.rm=TRUE) # skewness function in package "e1071"
## [1] 0.9528091
shapiro.test(cars$Horsepower)
##
## Shapiro-Wilk normality test
##
## data: cars$Horsepower
## W = 0.94573, p-value = 2.32e-11
(D) Use visual and quantitative methods to comment on whether an assumption of normality would be reasonable for Horsepower variable by Type, especially for Sports, SUV, and Truck (i.e., check normality of Horsepower from Type of i) Sports, ii) SUV, and iii) Truck).
S1 <- filter(cars, Type == "Sports")
ggplot(S1, aes(x = Type, y = Horsepower)) +
geom_boxplot(fill = "green") +
labs(title = " Horsepower by Sports")
shapiro.test(cars[cars$Type=="Sports", "Horsepower"])
##
## Shapiro-Wilk normality test
##
## data: cars[cars$Type == "Sports", "Horsepower"]
## W = 0.94276, p-value = 0.01898
S2 <- filter(cars, Type == "SUV")
ggplot(S2, aes(x = Type, y = Horsepower)) +
geom_boxplot( fill = "violet") +
labs(title = " Horsepower by SUV")
shapiro.test(cars[cars$Type=="SUV", "Horsepower"])
##
## Shapiro-Wilk normality test
##
## data: cars[cars$Type == "SUV", "Horsepower"]
## W = 0.95945, p-value = 0.04423
S3 <- filter(cars, Type == "Truck")
ggplot(S3, aes(x = Type, y = Horsepower)) +
geom_boxplot(fill = "yellow") +
labs(title = " Horsepower by Truck")
shapiro.test(cars[cars$Type=="Truck", "Horsepower"])
##
## Shapiro-Wilk normality test
##
## data: cars[cars$Type == "Truck", "Horsepower"]
## W = 0.8951, p-value = 0.01697
(A) Which test should we perform, and why? Justify your answer based on findings on Exercise 1 (d).
(B) Specify null and alternative hypotheses.
(C) State the conclusion based on the test result.
cars_filtered <-cars #made a copy of cars to filter out data.
library(dplyr)
cars_filtered <- cars %>% filter(Type %in% c("SUV","Truck"))#retaining only SUV and Truck variables in new dataframe
boxplot(Horsepower ~ Type, data = cars_filtered, main="Horsepower between SUV and Truck",
xlab="SUV or Truck", ylab="Horsepower")
qqnorm(cars_filtered$Horsepower[cars_filtered$Type=="SUV"]);
qqline(cars_filtered$Horsepower[cars_filtered$Type=="Truck"], col = 2)
shapiro.test(cars_filtered$Horsepower[cars_filtered$Type=="SUV"])
##
## Shapiro-Wilk normality test
##
## data: cars_filtered$Horsepower[cars_filtered$Type == "SUV"]
## W = 0.95945, p-value = 0.04423
shapiro.test(cars_filtered$Horsepower[cars_filtered$Type=="Truck"])
##
## Shapiro-Wilk normality test
##
## data: cars_filtered$Horsepower[cars_filtered$Type == "Truck"]
## W = 0.8951, p-value = 0.01697
# non-parametric wilcox test
wilcox.test(Horsepower ~ Type, data=cars_filtered, exact=FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Horsepower by Type
## W = 806.5, p-value = 0.3942
## alternative hypothesis: true location shift is not equal to 0
(A) Which test should we perform, and why? See QQ-plot and perform the Shapiro-Wilk test for normality check.
(B) Specify null and alternative hypotheses.
library(dplyr)
airquality_filtered <-airquality #made a copy of airquality to filter out data.
airquality <--airquality
airquality_filtered <--airquality
airquality_filtered <- airquality %>% filter(Month %in% c(-7,-8))#retaining only July and August variables in new dataframe
boxplot(Wind ~ Month, data = airquality_filtered, main="Wind Speed between August and July",
xlab="August and July", ylab="Wind Speed")
qqnorm(airquality_filtered$Wind[airquality_filtered$Month==-7]);
qqline(airquality_filtered$Wind[airquality_filtered$Month==-7], col = 2)
qqnorm(airquality_filtered$Wind[airquality_filtered$Month==-8]);
qqline(airquality_filtered$Wind[airquality_filtered$Month==-8], col = 2)
(C) State the conclusion based on the test result.
shapiro.test(airquality_filtered$Wind[airquality_filtered$Month==-7])
##
## Shapiro-Wilk normality test
##
## data: airquality_filtered$Wind[airquality_filtered$Month == -7]
## W = 0.95003, p-value = 0.1564
shapiro.test(airquality_filtered$Wind[airquality_filtered$Month==-8])
##
## Shapiro-Wilk normality test
##
## data: airquality_filtered$Wind[airquality_filtered$Month == -8]
## W = 0.98533, p-value = 0.937
# Equal variance test to decide - pooled t-test or satterthwaite t-test?
var.test(Wind ~ Month, airquality_filtered,
alternative = "two.sided")
##
## F test to compare two variances
##
## data: Wind by Month
## F = 1.129, num df = 30, denom df = 30, p-value = 0.7418
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.5443957 2.3415780
## sample estimates:
## ratio of variances
## 1.129046
bartlett.test(Wind ~ Month, airquality_filtered)
##
## Bartlett test of homogeneity of variances
##
## data: Wind by Month
## Bartlett's K-squared = 0.10861, df = 1, p-value = 0.7417
# parametric t-test
t.test(Wind ~ Month, airquality_filtered,
alternative = "two.sided",var.equal=TRUE)
##
## Two Sample t-test
##
## data: Wind by Month
## t = 0.1865, df = 60, p-value = 0.8527
## alternative hypothesis: true difference in means between group -8 and group -7 is not equal to 0
## 95 percent confidence interval:
## -1.443108 1.739883
## sample estimates:
## mean in group -8 mean in group -7
## -8.793548 -8.941935