Here we are trying to do some preprocessing.
# Load the cars data set from "cars.csv" (path relative to the working
# directory) and print a per-column summary of the resulting data frame.
cars <- read.csv(file = "cars.csv")
summary(object = cars)
## Sl..No mpg cylinders cubicinches hp
## Min. : 1 Min. :10.00 Min. :3.00 Min. : 68.0 Min. : 46.0
## 1st Qu.: 66 1st Qu.:16.90 1st Qu.:4.00 1st Qu.:101.0 1st Qu.: 75.0
## Median :131 Median :22.00 Median :6.00 Median :156.0 Median : 95.0
## Mean :131 Mean :23.14 Mean :5.59 Mean :201.1 Mean :106.4
## 3rd Qu.:196 3rd Qu.:28.80 3rd Qu.:8.00 3rd Qu.:302.0 3rd Qu.:138.0
## Max. :261 Max. :46.60 Max. :8.00 Max. :455.0 Max. :230.0
## weightlbs time.to.60 year brand
## Min. :1613 Min. : 8.00 Min. :1971 Length:261
## 1st Qu.:2246 1st Qu.:14.00 1st Qu.:1974 Class :character
## Median :2835 Median :16.00 Median :1977 Mode :character
## Mean :3005 Mean :15.55 Mean :1977
## 3rd Qu.:3664 3rd Qu.:17.00 3rd Qu.:1980
## Max. :4997 Max. :25.00 Max. :1983
Here we are trying to examine the outliers by using a histogram.
# Split the graphics device into one row of three panels, so the plots that
# follow are placed side by side.
par(mfrow = c(1, 3))

# Print the minimum, quartiles, mean and maximum of the weight column.
summary(cars[["weightlbs"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1613 2246 2835 3005 3664 4997
# Histogram of the weight column: about 30 bins requested, blue bars with
# black borders, x-axis fixed to 1500-5000 and y-axis fixed to 0-20.
# The title string contains a literal line break, so it renders on two lines.
hist(
  cars[["weightlbs"]],
  breaks = 30,
  col = "blue",
  border = "black",
  xlim = c(1500, 5000),
  ylim = c(0, 20),
  xlab = "Weight",
  ylab = "Counts",
  main = "Histogram
of Car Weights")
Here we are examining the outliers by using a scatter plot.
# Scatter plot of mpg (y) against weight (x), drawn as small solid blue
# points (pch = 20) on fixed axes: 1500-5000 for x and 0-50 for y.
plot(
  x = cars[["weightlbs"]],
  y = cars[["mpg"]],
  type = "p",
  pch = 20,
  col = "blue",
  xlim = c(1500, 5000),
  ylim = c(0, 50),
  xlab = "Weight",
  ylab = "MPG",
  main = "Scatterplot of MPG by Weight"
)
From the scatter plot, we have observed that the majority of the data lies within the range and is not an outlier.
# Box plot of mpg grouped by the number of cylinders.
boxplot(
  mpg ~ cylinders,
  data = cars,
  main = "Mileage Data",
  xlab = "Number of Cylinders",
  ylab = "Miles Per Gallon"
)

# Box plot of mpg grouped by brand. The statistics returned by boxplot() are
# stored in `box`, which the next statement overwrites.
box <- boxplot(
  mpg ~ brand,
  data = cars,
  main = "Mileage Data",
  xlab = "brand",
  ylab = "Miles Per Gallon"
)

# Single box plot of all mpg values. Only the `out` component of the result
# is kept, i.e. the values of the points lying beyond the whiskers.
box <- boxplot(
  cars[["mpg"]],
  main = "Mileage Data",
  xlab = "box",
  ylab = "Miles Per Gallon"
)[["out"]]
# Create a copy of the data that contains outliers.
# edit() opens R's interactive data editor and returns the edited copy, so
# this step needs an interactive session and its result depends on what is
# typed into the editor.
# NOTE(review): the recorded output shows mpg values of 86 and 76 in rows 8
# and 19, presumably the values entered here -- confirm.
cars1 <- edit(cars)

# Box plot of the edited mpg values.
boxplot(cars1$mpg, xlab = "box",
        ylab = "Miles Per Gallon", main = "Mileage Data")

# Draw the same plot again, this time keeping the `out` component: the
# values of the points lying beyond the whiskers.
box1 <- boxplot(cars1$mpg, xlab = "box",
                ylab = "Miles Per Gallon", main = "Mileage Data")$out

# Rows whose mpg equals one of the flagged values. %in% never returns NA, so
# the logical vector can index the rows directly without which().
outliers_data <- cars1[cars1$mpg %in% box1, ]
outliers_data
## Sl..No mpg cylinders cubicinches hp weightlbs time.to.60 year brand
## 8 8 86 8 440 215 4312 9 1971 US.
## 19 19 76 4 113 95 2278 16 1973 Japan.
# Keep every row whose mpg is not among the flagged outlier values.
# A logical mask is used instead of `-which(...)`: when nothing is flagged,
# which() returns integer(0), and indexing with a negated empty vector selects
# zero rows, so the result would be empty instead of the full data set.
cars2 <- cars1[!(cars1$mpg %in% box1), ]