## Pre-processing: basic data pre-processing. Set the working directory first, then read the data.
## Load the cars data set (cars.csv is resolved relative to the working
## directory) and print summary statistics for every column.
cars <- read.csv(file = "cars.csv")
summary(object = cars)
## Sl..No mpg cylinders cubicinches hp
## Min. : 1 Min. :10.00 Min. :3.00 Min. : 68.0 Min. : 46.0
## 1st Qu.: 66 1st Qu.:16.90 1st Qu.:4.00 1st Qu.:101.0 1st Qu.: 75.0
## Median :131 Median :22.00 Median :6.00 Median :156.0 Median : 95.0
## Mean :131 Mean :23.14 Mean :5.59 Mean :201.1 Mean :106.4
## 3rd Qu.:196 3rd Qu.:28.80 3rd Qu.:8.00 3rd Qu.:302.0 3rd Qu.:138.0
## Max. :261 Max. :46.60 Max. :8.00 Max. :455.0 Max. :230.0
## weightlbs time.to.60 year brand
## Min. :1613 Min. : 8.00 Min. :1971 Length:261
## 1st Qu.:2246 1st Qu.:14.00 1st Qu.:1974 Class :character
## Median :2835 Median :16.00 Median :1977 Mode :character
## Mean :3005 Mean :15.55 Mean :1977
## 3rd Qu.:3664 3rd Qu.:17.00 3rd Qu.:1980
## Max. :4997 Max. :25.00 Max. :1983
## Set up the plot area and create the histogram bars. A histogram is an informal way of checking for the presence of outliers.
## Histogram of vehicle weight with 30 requested breaks and both axes pinned
## to fixed ranges. The title literal spans two source lines, so the rendered
## title contains a line break.
hist(
  x = cars[["weightlbs"]],
  main = "Histogram
of Car Weights",
  xlab = "Weight",
  ylab = "Counts",
  xlim = c(1500, 5000),
  ylim = c(0, 20),
  breaks = 30,
  col = "blue",
  border = "black"
)
## Examining a scatter plot for outliers
## Scatter plot of mileage (y) against weight (x), drawn as points in blue
## using plotting symbol 20, with both axes pinned to fixed ranges.
plot(
  x = cars[["weightlbs"]],
  y = cars[["mpg"]],
  main = "Scatterplot of MPG by Weight",
  xlab = "Weight",
  ylab = "MPG",
  xlim = c(1500, 5000),
  ylim = c(0, 50),
  type = "p",
  pch = 20,
  col = "blue"
)
### Results - Scatter Plot: The majority of the data lies around the mean, and there are no obvious outliers in the relationship between mpg and weight.
## Box plot: examining outliers using box plots
## Mileage distribution for each cylinder count.
boxplot(
  mpg ~ cylinders,
  data = cars,
  main = "Mileage Data",
  xlab = "Number of Cylinders",
  ylab = "Miles Per Gallon"
)
## Mileage distribution for each brand; the value returned by boxplot() is
## kept in `box`.
box <- boxplot(
  mpg ~ brand,
  data = cars,
  main = "Mileage Data",
  xlab = "brand",
  ylab = "Miles Per Gallon"
)
##box plot for only mileage
## Single box plot of mpg; only the `out` component of the result is stored,
## replacing whatever `box` held before.
box <- boxplot(
  x = cars[["mpg"]],
  main = "Mileage Data",
  xlab = "box",
  ylab = "Miles Per Gallon"
)$out
## New data for outliers: creating a copy of the data with outliers
## Pass the data through edit() and keep whatever it returns as cars1.
## NOTE(review): edit() opens a data editor, so this step presumably needs an
## interactive session - confirm before running the script in batch mode.
cars1 <- edit(name = cars)
## Box plot of the edited mpg column. `$out` pulls the outlier values out of
## the result, so they can be inspected in the environment as box1.
box1 <- boxplot(
  x = cars1[["mpg"]],
  main = "Mileage Data",
  xlab = "box",
  ylab = "Miles Per Gallon"
)$out
##To get only the outlier value without plot
## Outlier values only, with plotting switched off. FALSE is spelled out
## because `F` is an ordinary variable that can be reassigned, whereas FALSE
## is a reserved word.
box2 <- boxplot(cars1$mpg, plot = FALSE)$out
## Complete rows whose mpg equals one of the outlier values. `%in%` never
## yields NA, so the logical mask can index the rows directly without which().
outliers_data <- cars1[cars1$mpg %in% box2, ]
outliers_data
## [1] Sl..No mpg cylinders cubicinches hp weightlbs
## [7] time.to.60 year brand
## <0 rows> (or 0-length row.names)
## Show the edited data in the data viewer.
View(x = cars1)
## Save the outlier rows as outliers_cars.csv (path is relative to the
## working directory).
write.csv(x = outliers_data, file = "outliers_cars.csv")
## Deleting the outlier rows. A negated logical mask is used instead of
## -which(...): when no value matches, which() returns integer(0), and
## indexing rows with -integer(0) selects zero rows rather than keeping them
## all, which would leave cars2 empty whenever there are no outliers.
cars2 <- cars1[!(cars1$mpg %in% box2), ]
##View(cars2)
##data2 <- cars[-which(cars$mpg %in% box),]