#Read in Data and do quick checks to understand data
library(ggplot2)
# Load the raw data set from a comma-separated text file with a header row.
# The path is machine-specific; adjust it before running elsewhere.
# stringsAsFactors = TRUE keeps `brand` a factor on R >= 4.0 (where the
# default became FALSE); the level recoding further down relies on that.
cars_outliers <- read.table(
  "C:\\Users\\oghen\\Desktop\\cars_outliers.txt",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
# head(cars_outliers)
# Inspect column types and the first few values of each variable.
str(cars_outliers)
## 'data.frame': 261 obs. of 8 variables:
## $ mpg : num 14 31.9 17 15 30.5 23 13 14 25.4 37.7 ...
## $ cylinders : int 8 4 8 8 4 8 8 8 5 4 ...
## $ cubicinches: int 350 89 302 400 98 350 351 440 183 89 ...
## $ hp : int 165 71 140 150 63 125 158 215 77 62 ...
## $ weightlbs : num 4209 1925 3449 3761 2051 ...
## $ time.to.60 : int 12 14 11 10 17 17 13 9 20 17 ...
## $ year : int 1972 1980 1971 1971 1978 1980 1974 1971 1980 1982 ...
## $ brand : Factor w/ 5 levels " Europe."," France.",..: 4 1 4 4 4 4 4 4 1 3 ...
# Fix the brand variable by making the data consistent.
# Use table() to check the distribution, then reassign the levels accordingly.
table(cars_outliers$brand)
##
## Europe. France. Japan. US. USA.
## 47 1 51 161 1
levels(cars_outliers$brand)
## [1] " Europe." " France." " Japan." " US." " USA."
# Levels are replaced by position, in the order printed above. Giving both
# " US." and " USA." the label "USA" merges them into a single level.
levels(cars_outliers$brand) <- c("Europe", "France", "Japan", "USA", "USA")
# Confirm the spread of brands after fixing.
table(cars_outliers$brand)
##
## Europe France Japan USA
## 47 1 51 162
# Bar chart of the number of observations per cleaned brand level.
ggplot(data = cars_outliers, mapping = aes(brand)) +
  geom_bar() +
  ggtitle("Bar Plot of number of brands/Country") +
  labs(x = "Country", y = "No of Brands")
##########################################################################################################################################################################################################################
#MPG
##########################################################################################################################################################################################################################
# Look for outliers in mpg.
# Start with a normal Q-Q plot to identify outliers.
qqnorm(cars_outliers$mpg)
qqline(cars_outliers$mpg)
# One outlier found.
# Cross-check the Q-Q plot with a box plot and a histogram.
boxplot(cars_outliers$mpg)
hist(cars_outliers$mpg)
# Both plots suggest that an outlier exists at an mpg of 500.
summary(cars_outliers$mpg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.00 16.90 22.00 24.99 29.00 500.00
# IQR fences: 1.5 * IQR beyond the quartiles reported by summary() above.
IQR <- 29.00 - 16.90
lower_outlier <- 16.90 - 1.5 * IQR
upper_outlier <- 29.00 + 1.5 * IQR
(bounds <- c(lower_outlier, upper_outlier))
## [1] -1.25 47.15
# Show the rows whose mpg falls outside the fences.
cars_outliers[
  cars_outliers$mpg < lower_outlier | cars_outliers$mpg > upper_outlier,
]
## mpg cylinders cubicinches hp weightlbs time.to.60 year brand
## 43 500 6 250 110 3645 16 1977 USA
# Outlier confirmed: mpg == 500.
# It is virtually impossible to travel 500 miles on one gallon. There are
# multiple options to fix this issue: one is to remove this data point;
# another is to extrapolate based on the mpg of cars in 1977 in America.
##########################################################################################################################################################################################################################
#Weightlbs
##########################################################################################################################################################################################################################
# Look for outliers in weightlbs.
# Start with a normal Q-Q plot to identify outliers.
qqnorm(cars_outliers$weightlbs)
qqline(cars_outliers$weightlbs)
# The Q-Q plot hints at several outliers; confirm or reject that by checking
# the box plot and the histogram.
boxplot(cars_outliers$weightlbs)
hist(cars_outliers$weightlbs)
# Neither the box plot nor the histogram shows a clear outlier, so apply the
# IQR outlier test.
summary(cars_outliers$weightlbs)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 192.5 2246.0 2835.0 2991.4 3664.0 4997.0
# NOTE(review): the minimum of 192.5 lbs lies inside the fences computed
# below but looks implausible for a car -- verify against the raw data.
# IQR fences built from the quartiles reported by summary() above.
IQR_weightlbs <- 3664.0 - 2246.0
lower_outlier_weightlbs <- 2246.0 - 1.5 * IQR_weightlbs
upper_outlier_Weightlbs <- 3664.0 + 1.5 * IQR_weightlbs
(bounds_weightlbs <- c(lower_outlier_weightlbs, upper_outlier_Weightlbs))
## [1] 119 5791
# Show the rows whose weight falls outside the fences.
cars_outliers[
  cars_outliers$weightlbs < lower_outlier_weightlbs |
    cars_outliers$weightlbs > upper_outlier_Weightlbs,
]
## [1] mpg cylinders cubicinches hp weightlbs time.to.60
## [7] year brand
## <0 rows> (or 0-length row.names)
# The IQR test flags no outliers for weight.
##########################################################################################################################################################################################################################
#Cylinders
##########################################################################################################################################################################################################################
# head(cars_outliers$cylinders)
# summary(cars_outliers$cylinders)
# 3. Look for outliers in cylinders using a box plot and a histogram.
boxplot(cars_outliers$cylinders)
hist(cars_outliers$cylinders)
# Neither plot shows a clear outlier, so apply the IQR outlier test.
summary(cars_outliers$cylinders)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 4.00 6.00 5.59 8.00 8.00
# IQR fences built from the quartiles reported by summary() above.
IQR_cylinder <- 8 - 4
lower_outlier_cylinder <- 4 - 1.5 * IQR_cylinder
upper_outlier_cylinder <- 8 + 1.5 * IQR_cylinder
bounds_cylinder <- c(lower_outlier_cylinder, upper_outlier_cylinder)
print(bounds_cylinder)
## [1] -2 14
# Show the rows whose cylinder count falls outside the fences.
cars_outliers[
  cars_outliers$cylinders < lower_outlier_cylinder |
    cars_outliers$cylinders > upper_outlier_cylinder,
]
## [1] mpg cylinders cubicinches hp weightlbs time.to.60
## [7] year brand
## <0 rows> (or 0-length row.names)
# The IQR test flags no outliers for the number of cylinders.
##########################################################################################################################################################################################################################
#Cubicinches
##########################################################################################################################################################################################################################
# Look for outliers in cubicinches using a box plot and a histogram.
boxplot(cars_outliers$cubicinches)
hist(cars_outliers$cubicinches)
# Neither plot shows a clear outlier, so apply the IQR outlier test.
summary(cars_outliers$cubicinches)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 68.0 101.0 156.0 201.1 302.0 455.0
# IQR fences built from the quartiles reported by summary() above.
# The fences must use this variable's own IQR (302 - 101 = 201); using the
# IQR of another column would make the bounds far too wide to be meaningful.
IQR_cubicinches <- 302.0 - 101.0
lower_outlier_cubicinches <- 101.0 - 1.5 * IQR_cubicinches
upper_outlier_cubicinches <- 302.0 + 1.5 * IQR_cubicinches
bounds_cubicinches <- c(lower_outlier_cubicinches, upper_outlier_cubicinches)
bounds_cubicinches
## [1] -200.5 603.5
# Show the rows whose displacement falls outside the fences. The observed
# range (68 to 455) lies inside them, so no rows are returned.
cars_outliers[
  cars_outliers$cubicinches < lower_outlier_cubicinches |
    cars_outliers$cubicinches > upper_outlier_cubicinches,
]
## [1] mpg cylinders cubicinches hp weightlbs time.to.60
## [7] year brand
## <0 rows> (or 0-length row.names)
# The IQR test flags no outliers for cubicinches.
##########################################################################################################################################################################################################################
#Hp
##########################################################################################################################################################################################################################
# Look for outliers in hp using a box plot and a histogram.
boxplot(cars_outliers$hp)
hist(cars_outliers$hp)
# Neither plot shows a clear outlier, so apply the IQR outlier test.
summary(cars_outliers$hp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 46.0 75.0 95.0 106.4 138.0 230.0
# IQR fences built from the quartiles reported by summary() above.
IQR_hp <- 138.0 - 75.0
lower_outlier_hp <- 75.0 - 1.5 * IQR_hp
upper_outlier_hp <- 138.0 + 1.5 * IQR_hp
(bounds_hp <- c(lower_outlier_hp, upper_outlier_hp))
## [1] -19.5 232.5
# Show the rows whose horsepower falls outside the fences.
cars_outliers[
  cars_outliers$hp < lower_outlier_hp | cars_outliers$hp > upper_outlier_hp,
]
## [1] mpg cylinders cubicinches hp weightlbs time.to.60
## [7] year brand
## <0 rows> (or 0-length row.names)
# The IQR test flags no outliers for hp.
##########################################################################################################################################################################################################################
#Time to 60
##########################################################################################################################################################################################################################
# Look for outliers in time.to.60, starting with a box plot.
boxplot(cars_outliers$time.to.60)
# Several outliers found.
summary(cars_outliers$time.to.60)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.00 14.00 16.00 15.55 17.00 25.00
# IQR fences built from the quartiles reported by summary() above.
IQR_time.to.60 <- 17.0 - 14.0
lower_outlier_time.to.60 <- 14.0 - 1.5 * IQR_time.to.60
upper_outlier_time.to.60 <- 17.0 + 1.5 * IQR_time.to.60
(bounds_time.to.60 <- c(lower_outlier_time.to.60, upper_outlier_time.to.60))
## [1] 9.5 21.5
# Show the rows whose time to 60 falls outside the fences.
cars_outliers[
  cars_outliers$time.to.60 < lower_outlier_time.to.60 |
    cars_outliers$time.to.60 > upper_outlier_time.to.60,
]
## mpg cylinders cubicinches hp weightlbs time.to.60 year brand
## 8 14.0 8 440 215 4312 9 1971 USA
## 52 27.2 4 141 71 3190 25 1980 Europe
## 53 14.0 8 340 160 3609 8 1971 USA
## 107 43.4 4 90 48 2335 24 1981 Europe
## 165 23.0 4 97 54 2254 24 1973 Europe
## 198 43.1 4 90 48 1985 22 1979 Europe
## 203 23.9 8 260 90 3420 22 1980 USA
## 206 15.0 8 390 190 3850 9 1971 USA
## 236 44.0 4 97 52 2130 25 1983 Europe
## 245 14.0 8 454 220 4354 9 1971 USA
## 248 44.3 4 90 48 2085 22 1981 Europe
# A total of 11 outliers found, as listed in the data frame above.
##########################################################################################################################################################################################################################
#year
##########################################################################################################################################################################################################################
# Look for outliers in year, starting with a box plot.
boxplot(cars_outliers$year)
# No outliers visible in the box plot.
summary(cars_outliers$year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1971 1974 1977 1977 1980 1983
# IQR fences built from the quartiles reported by summary() above.
IQR_year <- 1980 - 1974
lower_outlier_year <- 1974 - 1.5 * IQR_year
upper_outlier_year <- 1980 + 1.5 * IQR_year
(bounds_year <- c(lower_outlier_year, upper_outlier_year))
## [1] 1965 1989
# Show the rows whose year falls outside the fences.
cars_outliers[
  cars_outliers$year < lower_outlier_year |
    cars_outliers$year > upper_outlier_year,
]
## [1] mpg cylinders cubicinches hp weightlbs time.to.60
## [7] year brand
## <0 rows> (or 0-length row.names)
# The IQR test flags no outliers for year.
# For a truly rigorous treatment, year would need to be converted to a
# different unit, but that is probably beyond the scope of this assignment.