## Preprocessing: basic data preprocessing. Set the working directory first, then read the data.

# Load the cars data set from cars.csv in the working directory, then show
# per-column summary statistics.
cars <- read.csv(file = "cars.csv", header = TRUE)
summary(object = cars)
##      Sl..No         mpg          cylinders     cubicinches          hp       
##  Min.   :  1   Min.   :10.00   Min.   :3.00   Min.   : 68.0   Min.   : 46.0  
##  1st Qu.: 66   1st Qu.:16.90   1st Qu.:4.00   1st Qu.:101.0   1st Qu.: 75.0  
##  Median :131   Median :22.00   Median :6.00   Median :156.0   Median : 95.0  
##  Mean   :131   Mean   :23.14   Mean   :5.59   Mean   :201.1   Mean   :106.4  
##  3rd Qu.:196   3rd Qu.:28.80   3rd Qu.:8.00   3rd Qu.:302.0   3rd Qu.:138.0  
##  Max.   :261   Max.   :46.60   Max.   :8.00   Max.   :455.0   Max.   :230.0  
##    weightlbs      time.to.60         year         brand          
##  Min.   :1613   Min.   : 8.00   Min.   :1971   Length:261        
##  1st Qu.:2246   1st Qu.:14.00   1st Qu.:1974   Class :character  
##  Median :2835   Median :16.00   Median :1977   Mode  :character  
##  Mean   :3005   Mean   :15.55   Mean   :1977                     
##  3rd Qu.:3664   3rd Qu.:17.00   3rd Qu.:1980                     
##  Max.   :4997   Max.   :25.00   Max.   :1983

## GRAPHICAL METHODS FOR IDENTIFYING OUTLIERS

## Set up the plot area and create the histogram bars. A histogram is an informal way of examining the data for the presence of outliers.

# Histogram of car weights, used as an informal visual check for outliers.
# The x axis is limited to 1500-5000 and the y axis to 0-20.
hist(
  x = cars$weightlbs,
  breaks = 30,
  main = "Histogram
of Car Weights",
  xlab = "Weight",
  ylab = "Counts",
  xlim = c(1500, 5000),
  ylim = c(0, 20),
  col = "blue",
  border = "black"
)

## Examining a scatter plot for outliers

# Scatter plot of mileage (y) against weight (x), drawn as solid blue points,
# to look for observations that sit apart from the rest.
plot(
  x = cars$weightlbs,
  y = cars$mpg,
  type = "p",
  pch = 20,
  col = "blue",
  main = "Scatterplot of MPG by Weight",
  xlab = "Weight",
  ylab = "MPG",
  xlim = c(1500, 5000),
  ylim = c(0, 50)
)

### Results - Scatter plot: the majority of the data lies around the mean, and there are no specific outliers in the data between mpg and weight.

## Box plot: examining outliers using box plots

# Mileage distribution within each cylinder count.
boxplot(
  mpg ~ cylinders,
  data = cars,
  main = "Mileage Data",
  xlab = "Number of Cylinders",
  ylab = "Miles Per Gallon"
)

# Mileage distribution within each brand; the value returned by boxplot()
# is kept in `box`.
box <- boxplot(
  mpg ~ brand,
  data = cars,
  main = "Mileage Data",
  xlab = "brand",
  ylab = "Miles Per Gallon"
)

## Box plot for mileage only

# Box plot of mileage over all cars. Only the `out` component of the value
# returned by boxplot() is stored in `box`.
box <- boxplot(
  x = cars$mpg,
  main = "Mileage Data",
  xlab = "box",
  ylab = "Miles Per Gallon"
)[["out"]]

## New data for outliers: creating data with outliers

# Open cars in the editor so outlier values can be entered by hand; the edited
# copy becomes cars1. In a non-interactive run nobody can drive the editor,
# so cars1 falls back to an unmodified copy of cars.
cars1 <- if (interactive()) edit(cars) else cars

## $out is used to get the outlier values; they can be seen in the environment

# Box plot of mileage for the edited data. Only the `out` component of the
# value returned by boxplot() is stored in `box1`.
box1 <- boxplot(
  x = cars1$mpg,
  main = "Mileage Data",
  xlab = "box",
  ylab = "Miles Per Gallon"
)[["out"]]

## To get only the outlier values without drawing the plot

# Compute the box-plot statistics without drawing anything and keep only the
# outlier values. `FALSE` is spelled out because `F` is an ordinary variable
# that can be reassigned.
box2 <- boxplot(cars1$mpg, plot = FALSE)$out

## Identifying the complete rows that contain an outlier value

# Keep every row of cars1 whose mpg is one of the outlier values in box2,
# then display the result.
outliers_data <- cars1[cars1$mpg %in% box2, ]
outliers_data
## [1] Sl..No      mpg         cylinders   cubicinches hp          weightlbs  
## [7] time.to.60  year        brand      
## <0 rows> (or 0-length row.names)
# Open the data viewer only in an interactive session; View() needs a GUI
# and would fail or do nothing useful in a batch run.
if (interactive()) {
  View(cars1)
}

## Saving the outlier rows to a CSV file

# Write the outlier rows to outliers_cars.csv in the working directory.
write.csv(x = outliers_data, file = "outliers_cars.csv")

## Deleting the outlier rows. A logical mask is used instead of "-which":
## when no mpg value matches, -which(...) is integer(0), which selects zero
## rows and would drop every car instead of none.

cars2 <- cars1[!(cars1$mpg %in% box2), ]

##View(cars2)

##data2 <- cars[-which(cars$mpg %in% box),]