Homework 1
# R Markdown output format carried over from the document header:
#   output: html_document (default)

# Install dplyr only when it is missing, so that running the script does not
# reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

# Packages attached for the exercises, in the original load order.
library(e1071)
library(fBasics)
library(tidyverse)
library(devtools)
library(dplyr)
Exercise 1
# Combined fuel economy: 40% city MPG plus 60% highway MPG for every car.
MPGCombo <- 0.4 * CARS$MPG_City + 0.6 * CARS$MPG_Highway

# Attach the combined figure to the data set as a new column.
CARS <- data.frame(CARS, MPGCombo)

# Distribution of the combined MPG over all cars.
boxplot(
  MPGCombo,
  main = "MPG Combined (Highway and City)",
  ylab = "Miles per Gallon",
  xlab = "40% City and 60% Highway Observations"
)

unique(CARS$Type)
# [1] "SUV" "Sedan" "Sports" "Wagon" "Truck"

# Combined MPG, one boxplot panel per car type.
ggplot(data = CARS) +
  geom_boxplot(mapping = aes(y = MPGCombo)) +
  facet_wrap(~ Type, nrow = 2)

# Horsepower: numeric summary, histogram and skewness over all cars.
summary(CARS$Horsepower)

hist(CARS$Horsepower, main = "Histogram of Horsepower", xlab = "Horsepower")
skewness(CARS$Horsepower)

# Horsepower histograms per car type. labs() takes `title`, not `main`.
ggplot(data = CARS) +
  geom_histogram(mapping = aes(x = Horsepower)) +
  facet_wrap(~ Type, nrow = 2) +
  labs(title = "Horsepower by Type of car")

# Normality test of horsepower over all cars.
shapiro.test(CARS$Horsepower)

unique(CARS$Type)
# [1] "SUV" "Sedan" "Sports" "Wagon" "Truck"

# Horsepower values for each car type, kept as plain numeric vectors so they
# can be passed directly to shapiro.test() and wilcox.test().
Sedan <- CARS$Horsepower[CARS$Type == "Sedan"]
SUV <- CARS$Horsepower[CARS$Type == "SUV"]
Sports <- CARS$Horsepower[CARS$Type == "Sports"]
Wagon <- CARS$Horsepower[CARS$Type == "Wagon"]
Truck <- CARS$Horsepower[CARS$Type == "Truck"]

shapiro.test(Sedan)
# Reported: W = 0.95154, p-value = 1.205e-07
# Since the p-value is < 0.05 (alpha), the data do not follow a normal
# distribution.

shapiro.test(Sports)
# Reported: W = 0.94276, p-value = 0.01898
# Since the p-value is < 0.05 (alpha), the data do not follow a normal
# distribution.

shapiro.test(SUV)
# Reported: W = 0.95945, p-value = 0.04423
# Even though the p-value is less than 0.05, we could be flexible and assume
# that the data follow a distribution very close to a normal one (visually we
# can see the same).

shapiro.test(Truck)
# Reported: W = 0.8951, p-value = 0.01697
# Since the p-value is < 0.05 (alpha), the data do not follow a normal
# distribution.

shapiro.test(Wagon)
# Reported: W = 0.94074, p-value = 0.09525
# This is the only car type whose data could follow a normal distribution
# (p-value > 0.05), even though visually the normal shape is not very clear.
Exercise 2
# Two-sided Wilcoxon rank-sum test comparing the SUV and Truck samples.
wilcox.test(SUV, Truck, alternative = "two.sided")
Exercise 3
# Open the data viewer only in interactive sessions; View() cannot be used
# when the script is run non-interactively (e.g. Rscript or knitting).
if (interactive()) {
  View(airquality)
}

# airquality is already a data frame, so preview it instead of copying it with
# data.frame() just to print every row.
head(airquality)
summary(airquality)

# Observations for July (Month == 7) and August (Month == 8).
month_July <- subset(airquality, Month == 7)
month_August <- subset(airquality, Month == 8)

# Side-by-side histograms of wind for the two months.
par(mfrow = c(1, 2))
hist(month_July$Wind, main = "Wind for July", xlab = "Wind")
hist(month_August$Wind, main = "Wind for August", xlab = "Wind")

summary(month_July$Wind)
# Reported:
#  Min. 1st Qu. Median   Mean 3rd Qu.   Max.
# 4.100   6.900  8.600  8.942  10.900 14.900

summary(month_August$Wind)
# Reported:
#  Min. 1st Qu. Median   Mean 3rd Qu.   Max.
# 2.300   6.600  8.600  8.794  11.200 15.500

# Side-by-side normal Q-Q plots, then restore the single-panel layout.
par(mfrow = c(1, 2))
qqnorm(month_July$Wind, main = "Q-Q Plot July")
qqnorm(month_August$Wind, main = "Q-Q Plot August")
par(mfrow = c(1, 1))

# Normality tests for each month's wind values.
shapiro.test(month_July$Wind)
# Reported: data: month_July$Wind
# W = 0.95003, p-value = 0.1564

shapiro.test(month_August$Wind)
# Reported: data: month_August$Wind
# W = 0.98533, p-value = 0.937

# Two-sided two-sample t-test comparing July and August wind.
t.test(month_July$Wind, month_August$Wind, alternative = "two.sided")
Comments: The mean is around 22 mpg (approx.), and it is expected from combining