#Read in Data and do quick checks to understand data

library(ggplot2)
# Load the raw comma-separated cars data set into `cars_outliers`.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned. `stringsAsFactors = TRUE` keeps `brand` a factor on
# R >= 4.0.0 (where the default became FALSE); the level clean-up further
# down and the recorded str() output both rely on `brand` being a factor.
cars_outliers <- read.table(
  "C:\\Users\\oghen\\Desktop\\cars_outliers.txt",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
#head(cars_outliers)
# Quick structural check: column names, types and the first few values.
str(cars_outliers)
## 'data.frame':    261 obs. of  8 variables:
##  $ mpg        : num  14 31.9 17 15 30.5 23 13 14 25.4 37.7 ...
##  $ cylinders  : int  8 4 8 8 4 8 8 8 5 4 ...
##  $ cubicinches: int  350 89 302 400 98 350 351 440 183 89 ...
##  $ hp         : int  165 71 140 150 63 125 158 215 77 62 ...
##  $ weightlbs  : num  4209 1925 3449 3761 2051 ...
##  $ time.to.60 : int  12 14 11 10 17 17 13 9 20 17 ...
##  $ year       : int  1972 1980 1971 1971 1978 1980 1974 1971 1980 1982 ...
##  $ brand      : Factor w/ 5 levels " Europe."," France.",..: 4 1 4 4 4 4 4 4 1 3 ...

# Fix the brand variable by making the data consistent

# Inspect the raw brand labels: they carry a leading space and a trailing
# period, and the USA appears under two spellings.
table(cars_outliers$brand)
## 
##  Europe.  France.   Japan.      US.     USA. 
##       47        1       51      161        1
# Coerce to factor first so the recoding also works when `brand` was read in
# as character (the read.table default since R 4.0.0).
cars_outliers$brand <- factor(cars_outliers$brand)
levels(cars_outliers$brand)
## [1] " Europe." " France." " Japan."  " US."     " USA."
# Recode by name rather than by position, so the mapping cannot silently
# mislabel rows if the set or order of raw levels ever changes. Both US
# spellings collapse into "USA"; any raw label not listed here becomes NA.
levels(cars_outliers$brand) <- list(
  Europe = " Europe.",
  France = " France.",
  Japan = " Japan.",
  USA = c(" US.", " USA.")
)

#Confirm spread of brands after fixing.
table(cars_outliers$brand)
## 
## Europe France  Japan    USA 
##     47      1     51    162
# Bar chart of the number of rows for each cleaned `brand` level.
ggplot(cars_outliers, aes(x = brand)) +
  geom_bar() +
  xlab("Country") +
  ylab("No of Brands") +
  ggtitle("Bar Plot of number of brands/Country")

##########################################################################################################################################################################################################################

                                        #MPG


##########################################################################################################################################################################################################################


# Find outliers in mpg
# Use a normal Q-Q plot to identify outliers
qqnorm(cars_outliers$mpg)
qqline(cars_outliers$mpg)

#One outlier found

#Confirm result using boxplot and histogram.
boxplot(cars_outliers$mpg)

hist(cars_outliers$mpg)

#From both plots, it appears that an outlier exists at mpg of 500.
summary(cars_outliers$mpg)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   16.90   22.00   24.99   29.00  500.00
# Tukey fences: [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
# The quartiles are taken from the data instead of being retyped from the
# summary() printout, and the spread is stored as `IQR_mpg` (matching the
# IQR_<column> names used for the other variables) so that it no longer
# masks stats::IQR(). `names = FALSE` keeps `bounds` printing as a plain
# unnamed vector.
quartiles_mpg <- quantile(
  cars_outliers$mpg,
  probs = c(0.25, 0.75),
  na.rm = TRUE,
  names = FALSE
)
IQR_mpg <- quartiles_mpg[2] - quartiles_mpg[1]
lower_outlier <- quartiles_mpg[1] - 1.5 * IQR_mpg
upper_outlier <- quartiles_mpg[2] + 1.5 * IQR_mpg
bounds <- c(lower_outlier, upper_outlier)
bounds
## [1] -1.25 47.15
# Rows whose mpg falls outside the fences.
cars_outliers[
  !(cars_outliers$mpg >= lower_outlier & cars_outliers$mpg <= upper_outlier),
]
##    mpg cylinders cubicinches  hp weightlbs time.to.60 year brand
## 43 500         6         250 110      3645         16 1977   USA
#Outlier: mpg==500 is confirmed.
# It is virtually impossible to travel 500 miles on one gallon. There are
# multiple options to fix this issue: one is to remove this data point;
# another is to extrapolate based on the mpg of cars built in 1977 in America.
##########################################################################################################################################################################################################################

                                        #Weightlbs


##########################################################################################################################################################################################################################


# Outlier search for weightlbs
# Start with a normal Q-Q plot.
qqnorm(cars_outliers$weightlbs)
qqline(cars_outliers$weightlbs)

# The Q-Q plot hints at several outliers; cross-check with a box plot and a
# histogram before accepting that.

boxplot(cars_outliers$weightlbs)

hist(cars_outliers$weightlbs)

# Neither plot singles out a clear outlier, so apply the 1.5 * IQR fence test.


summary(cars_outliers$weightlbs)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   192.5  2246.0  2835.0  2991.4  3664.0  4997.0
# Quartiles are copied from the summary() printout above; the fences are
# built in one vectorised step and then split into the lower/upper scalars.
IQR_weightlbs <- 3664.0 - 2246.0
bounds_weightlbs <- c(2246.0, 3664.0) + c(-1.5, 1.5) * IQR_weightlbs
lower_outlier_weightlbs <- bounds_weightlbs[1]
upper_outlier_Weightlbs <- bounds_weightlbs[2]
bounds_weightlbs
## [1]  119 5791
# Rows lying below the lower fence or above the upper fence.
cars_outliers[
  cars_outliers$weightlbs < lower_outlier_weightlbs |
    cars_outliers$weightlbs > upper_outlier_Weightlbs,
]
## [1] mpg         cylinders   cubicinches hp          weightlbs   time.to.60 
## [7] year        brand      
## <0 rows> (or 0-length row.names)
# The fence test flags no weightlbs values.
# NOTE(review): the minimum of 192.5 lbs sits inside the fences but looks
# implausibly light for a car - confirm whether it is a data-entry error.
##########################################################################################################################################################################################################################

                                        #Cylinders

##########################################################################################################################################################################################################################

#head(cars_outliers$cylinders)
#summary(cars_outliers$cylinders)
## 3. Outlier search for cylinders: box plot and histogram first
boxplot(cars_outliers$cylinders)

hist(cars_outliers$cylinders)

# The plots show no obvious outlier, so fall back on the 1.5 * IQR fence test.
summary(cars_outliers$cylinders)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00    4.00    6.00    5.59    8.00    8.00
# Q1 = 4 and Q3 = 8 are copied from the summary() printout above.
IQR_cylinder <- 8 - 4
bounds_cylinder <- c(4, 8) + c(-1.5, 1.5) * IQR_cylinder
lower_outlier_cylinder <- bounds_cylinder[1]
upper_outlier_cylinder <- bounds_cylinder[2]
print(bounds_cylinder)
## [1] -2 14
# Keep only the rows whose cylinder count is outside the fences.
cars_outliers[
  with(
    cars_outliers,
    cylinders < lower_outlier_cylinder | cylinders > upper_outlier_cylinder
  ),
]
## [1] mpg         cylinders   cubicinches hp          weightlbs   time.to.60 
## [7] year        brand      
## <0 rows> (or 0-length row.names)
# The fence test flags no cylinder counts.
##########################################################################################################################################################################################################################

                                        #Cubicinches


##########################################################################################################################################################################################################################
# Find outliers in cubicinches
boxplot(cars_outliers$cubicinches)

hist(cars_outliers$cubicinches)

# No outliers are visible in the box plot or histogram, so apply the
# 1.5 * IQR fence test.
summary(cars_outliers$cubicinches)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    68.0   101.0   156.0   201.1   302.0   455.0
# Tukey fences: [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
# The fences must be built from this column's own IQR. The earlier version
# reused IQR_weightlbs here, which produced fences of -2026 and 2429 that
# were far too wide. Taking the quartiles straight from the data also avoids
# retyping numbers from the summary() printout. `names = FALSE` keeps the
# bounds printing as a plain unnamed vector.
quartiles_cubicinches <- quantile(
  cars_outliers$cubicinches,
  probs = c(0.25, 0.75),
  na.rm = TRUE,
  names = FALSE
)
IQR_cubicinches <- quartiles_cubicinches[2] - quartiles_cubicinches[1]
lower_outlier_cubicinches <- quartiles_cubicinches[1] - 1.5 * IQR_cubicinches
upper_outlier_cubicinches <- quartiles_cubicinches[2] + 1.5 * IQR_cubicinches
bounds_cubicinches <- c(lower_outlier_cubicinches, upper_outlier_cubicinches)
bounds_cubicinches
## [1] -200.5  603.5
# Rows whose displacement falls outside the fences.
cars_outliers[
  !(cars_outliers$cubicinches >= lower_outlier_cubicinches &
    cars_outliers$cubicinches <= upper_outlier_cubicinches),
]
## [1] mpg         cylinders   cubicinches hp          weightlbs   time.to.60 
## [7] year        brand      
## <0 rows> (or 0-length row.names)
# No outliers found for cubicinches: the observed range (68 to 455) lies
# inside the corrected fences.
##########################################################################################################################################################################################################################

                                        #Hp


##########################################################################################################################################################################################################################

## Outlier search for hp
 
boxplot(cars_outliers$hp)

hist(cars_outliers$hp)

# The plots show no obvious outlier, so apply the 1.5 * IQR fence test.



summary(cars_outliers$hp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    46.0    75.0    95.0   106.4   138.0   230.0
# Q1 = 75 and Q3 = 138 are copied from the summary() printout above.
IQR_hp <- 138.0 - 75.0
bounds_hp <- c(75.0, 138.0) + c(-1.5, 1.5) * IQR_hp
lower_outlier_hp <- bounds_hp[1]
upper_outlier_hp <- bounds_hp[2]
bounds_hp
## [1] -19.5 232.5
# Rows with horsepower below the lower fence or above the upper fence.
cars_outliers[
  cars_outliers$hp < lower_outlier_hp | cars_outliers$hp > upper_outlier_hp,
]
## [1] mpg         cylinders   cubicinches hp          weightlbs   time.to.60 
## [7] year        brand      
## <0 rows> (or 0-length row.names)
# The fence test flags no hp values.
##########################################################################################################################################################################################################################

                                        #Time to 60


##########################################################################################################################################################################################################################

# Outlier search for time.to.60, starting with a box plot.
boxplot(cars_outliers$time.to.60)

# The box plot shows several points beyond the whiskers.


summary(cars_outliers$time.to.60)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.00   14.00   16.00   15.55   17.00   25.00
# Q1 = 14 and Q3 = 17 are copied from the summary() printout above.
IQR_time.to.60 <- 17.0 - 14.0
bounds_time.to.60 <- c(14.0, 17.0) + c(-1.5, 1.5) * IQR_time.to.60
lower_outlier_time.to.60 <- bounds_time.to.60[1]
upper_outlier_time.to.60 <- bounds_time.to.60[2]
bounds_time.to.60
## [1]  9.5 21.5
# List every row whose time.to.60 lies outside the fences.
cars_outliers[
  with(
    cars_outliers,
    time.to.60 < lower_outlier_time.to.60 |
      time.to.60 > upper_outlier_time.to.60
  ),
]
##      mpg cylinders cubicinches  hp weightlbs time.to.60 year  brand
## 8   14.0         8         440 215      4312          9 1971    USA
## 52  27.2         4         141  71      3190         25 1980 Europe
## 53  14.0         8         340 160      3609          8 1971    USA
## 107 43.4         4          90  48      2335         24 1981 Europe
## 165 23.0         4          97  54      2254         24 1973 Europe
## 198 43.1         4          90  48      1985         22 1979 Europe
## 203 23.9         8         260  90      3420         22 1980    USA
## 206 15.0         8         390 190      3850          9 1971    USA
## 236 44.0         4          97  52      2130         25 1983 Europe
## 245 14.0         8         454 220      4354          9 1971    USA
## 248 44.3         4          90  48      2085         22 1981 Europe
# The fence test flags the 11 rows listed above.
##########################################################################################################################################################################################################################

                                        #year


##########################################################################################################################################################################################################################


# Outlier search for year, starting with a box plot.
boxplot(cars_outliers$year)

# The box plot shows no points beyond the whiskers.


summary(cars_outliers$year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1971    1974    1977    1977    1980    1983
# Q1 = 1974 and Q3 = 1980 are copied from the summary() printout above.
IQR_year <- 1980 - 1974
bounds_year <- c(1974, 1980) + c(-1.5, 1.5) * IQR_year
lower_outlier_year <- bounds_year[1]
upper_outlier_year <- bounds_year[2]
bounds_year
## [1] 1965 1989
# Rows whose model year is outside the fences.
cars_outliers[
  cars_outliers$year < lower_outlier_year |
    cars_outliers$year > upper_outlier_year,
]
## [1] mpg         cylinders   cubicinches hp          weightlbs   time.to.60 
## [7] year        brand      
## <0 rows> (or 0-length row.names)
# The fence test flags no year values.

#To truly find a perfect solution, year would need to be converted to a different unit, but that is probably beyond the scope of this assignment.