## Pre-Processing in R: Here, we perform basic data pre-processing.
# Point the session at the folder that holds the course data so that the
# relative file names used in this script resolve there.
# NOTE(review): this absolute path is machine-specific; confirm it exists
# before running on another computer.
setwd("C:\\Users\\pahar\\OneDrive\\IIM-K\\Classes\\2022-07-30")

# Load the cars data set into a data frame.
cars <- read.csv(file = "cars.csv")

# Print per-column summary statistics.
summary(cars)
## Sl..No mpg cylinders cubicinches hp
## Min. : 1 Min. :10.00 Min. :3.00 Min. : 68.0 Min. : 46.0
## 1st Qu.: 66 1st Qu.:16.90 1st Qu.:4.00 1st Qu.:101.0 1st Qu.: 75.0
## Median :131 Median :22.00 Median :6.00 Median :156.0 Median : 95.0
## Mean :131 Mean :23.14 Mean :5.59 Mean :201.1 Mean :106.4
## 3rd Qu.:196 3rd Qu.:28.80 3rd Qu.:8.00 3rd Qu.:302.0 3rd Qu.:138.0
## Max. :261 Max. :46.60 Max. :8.00 Max. :455.0 Max. :230.0
## weightlbs time.to.60 year brand
## Min. :1613 Min. : 8.00 Min. :1971 Length:261
## 1st Qu.:2246 1st Qu.:14.00 1st Qu.:1974 Class :character
## Median :2835 Median :16.00 Median :1977 Mode :character
## Mean :3005 Mean :15.55 Mean :1977
## 3rd Qu.:3664 3rd Qu.:17.00 3rd Qu.:1980
## Max. :4997 Max. :25.00 Max. :1983
# Show the structure of the data frame: its dimensions plus each column's
# type and first few values.
str(object = cars)
## 'data.frame': 261 obs. of 9 variables:
## $ Sl..No : int 1 2 3 4 5 6 7 8 9 10 ...
## $ mpg : num 14 31.9 17 15 30.5 23 13 14 25.4 37.7 ...
## $ cylinders : int 8 4 8 8 4 8 8 8 5 4 ...
## $ cubicinches: int 350 89 302 400 98 350 351 440 183 89 ...
## $ hp : int 165 71 140 150 63 125 158 215 77 62 ...
## $ weightlbs : int 4209 1925 3449 3761 2051 3900 4363 4312 3530 2050 ...
## $ time.to.60 : int 12 14 11 10 17 17 13 9 20 17 ...
## $ year : int 1972 1980 1971 1971 1978 1980 1974 1971 1980 1982 ...
## $ brand : chr " US." " Europe." " US." " US." ...
Here we examine the data for the presence of outliers using a histogram, which is a very informal check.
# Optional layout for three plots side by side (left disabled):
# par(mfrow = c(1, 3))

# Histogram of car weight, used as a quick visual check for extreme values.
hist(
  x = cars$weightlbs,
  # A single number here is only a suggested bin count; hist() picks
  # "pretty" break points close to it.
  breaks = 30,
  # The title contains a line break, so it is drawn on two lines.
  main = "Histogram\nof Car Weights",
  xlab = "Weight",
  ylab = "Counts",
  xlim = c(1500, 5000),
  ylim = c(0, 20),
  col = "blue",
  border = "black"
)
Here we examine a scatter plot to explore the presence of outliers.
# Scatter plot of mileage against weight; points far from the main cloud
# would be candidate outliers.
plot(
  x = cars$weightlbs,
  y = cars$mpg,
  main = "Scatterplot of MPG by Weight",
  xlab = "Weight",
  ylab = "MPG",
  xlim = c(1500, 5000),
  ylim = c(0, 50),
  type = "p",   # points only, no connecting lines
  pch = 20,     # small filled dot
  col = "blue"
)
From the scatter plot, we find that the majority of the data points hover around the mean; there is not much evidence of outliers between the MPG and the weight of the cars.
Here we create box plots to identify the outliers.
# Box plots of mileage, used to spot outliers.

# Mileage grouped by number of cylinders.
boxplot(
  mpg ~ cylinders,
  data = cars,
  xlab = "Number of Cylinders",
  ylab = "Miles Per Gallon",
  main = "Mileage Data"
)

# Mileage grouped by brand. This call is made for the plot only; its return
# value is not stored because `box` is assigned by the next statement.
boxplot(
  mpg ~ brand,
  data = cars,
  xlab = "brand",
  ylab = "Miles Per Gallon",
  main = "Mileage Data"
)

# Box plot of mileage alone. The `out` component of boxplot()'s return
# value holds the data points lying beyond the whiskers; those mpg values
# are kept in `box`.
# NOTE(review): `box` is also the name of a graphics function; the name is
# kept here in case other code relies on it.
box <- boxplot(
  cars$mpg,
  xlab = "box",
  ylab = "Miles Per Gallon",
  main = "Mileage Data"
)$out
#Box plot for cars1 data
#box1 <- boxplot(cars1$mpg)
# Build `cars1`, a copy of `cars` into which outliers are entered by hand.
# edit() opens the data editor and returns the edited copy, so this step
# needs an interactive session and its result depends on what is typed in.
# NOTE(review): for a reproducible run, consider assigning the outlier
# values in code instead of editing them manually.
cars1 <- edit(cars)
#box1 <- boxplot(cars1$mpg, xlab = "box",
#ylab = "Miles Per Gallon", main = "Mileage Data")$out
# Draw a box plot of the edited mileage column and keep the values that
# boxplot() reports as outliers (points beyond the whiskers).
box1 <- boxplot(x = cars1$mpg)[["out"]]

# Rows of cars1 whose mpg equals one of those outlier values.
outliers_data <- cars1[cars1$mpg %in% box1, ]
outliers_data
## Sl..No mpg cylinders cubicinches hp weightlbs time.to.60 year brand
## 8 8 56 8 440 215 4312 9 1971 US.
## 19 19 59 4 113 95 2278 16 1973 Japan.

# Save the outlier rows to a CSV file in the working directory.
write.csv(x = outliers_data, file = "outlier.csv")
#data2 <- cars1[-which(cars1$mpg %in% box),]
# Delete the outlier rows from cars1, keeping the remaining rows in cars2.
# A logical mask is used rather than -which(...): when no value matches,
# which() returns integer(0) and cars1[-integer(0), ] selects zero rows, so
# the data set would be emptied instead of being left unchanged.
cars2 <- cars1[!(cars1$mpg %in% box1), ]