#ANALYSIS OF AUTO-MPG DATASET ##BY Palepu Venkata Hemanth - 20MIC0105
#A. LINEAR REGRESSION
1. import dataset
# Load the Auto MPG data from the UCI repository. The file has no header row,
# so the column names are supplied here. "?" entries are read as NA instead of
# being routed through a 0 sentinel value.
data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
dataset <- read.table(
  data_url,
  header = FALSE,
  na.strings = "?",
  col.names = c(
    "mpg", "cylinders", "displacement", "horsepower", "weight",
    "acceleration", "model_year", "origin", "car_name"
  )
)
dataset$horsepower <- as.numeric(dataset$horsepower)

# Impute missing horsepower with the mean of the observed values. The divisor
# is counted from the data rather than hard-coded as 392, and the mean is
# truncated to a whole number exactly as the previous as.integer() call did.
hp_missing <- is.na(dataset$horsepower)
replace_mean <- as.integer(
  sum(dataset$horsepower, na.rm = TRUE) / sum(!hp_missing)
)
dataset$horsepower[hp_missing] <- replace_mean
head(dataset)
## mpg cylinders displacement horsepower weight acceleration model_year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## car_name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
# Compact structure listing: one line per column with its type and first values.
str(dataset)
## 'data.frame': 398 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ model_year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ car_name : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caTools)
# dplyr's column-wise preview of the data frame (row/column counts, types,
# leading values).
glimpse(dataset)
## Rows: 398
## Columns: 9
## $ mpg <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15, 14, 2…
## $ cylinders <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4, …
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 383, 34…
## $ horsepower <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170, 16…
## $ weight <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425, 385…
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8.5, …
## $ model_year <int> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7…
## $ origin <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, …
## $ car_name <chr> "chevrolet chevelle malibu", "buick skylark 320", "plymou…
# View() opens a spreadsheet-style viewer window, so it is only called in an
# interactive session; non-interactive runs skip it.
if (interactive()) {
  View(dataset)
}
2. SPLITTING THE DATA
# Fix the RNG seed so the random train/test partition, and every number
# derived from it, is reproducible from run to run.
set.seed(123)
# sample.split() returns a logical vector; TRUE rows go to the training set and
# FALSE rows to the test set. SplitRatio = 0.75 is the requested training share.
split <- sample.split(dataset$mpg, SplitRatio = 0.75)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)
dim(training_set)
## [1] 324 9
dim(test_set)
## [1] 74 9
3. regression model
# Least-squares fit of mpg on five numeric predictors, using training rows only.
reg <- lm(
  formula = mpg ~ weight + horsepower + cylinders + displacement + acceleration,
  data = training_set
)
# Echo the call and the fitted coefficients.
reg
##
## Call:
## lm(formula = mpg ~ weight + horsepower + cylinders + displacement +
## acceleration, data = training_set)
##
## Coefficients:
## (Intercept) weight horsepower cylinders displacement
## 45.348877 -0.006056 -0.036800 -0.173980 0.001188
## acceleration
## 0.058783
# Residual quantiles, coefficient table (estimate, std. error, t, p) and
# overall fit statistics for the linear model.
summary(reg)
##
## Call:
## lm(formula = mpg ~ weight + horsepower + cylinders + displacement +
## acceleration, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.5297 -2.8032 -0.5419 2.3327 15.9618
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.348877 3.003676 15.098 < 2e-16 ***
## weight -0.006056 0.001049 -5.772 1.86e-08 ***
## horsepower -0.036800 0.019462 -1.891 0.0595 .
## cylinders -0.173980 0.484887 -0.359 0.7200
## displacement 0.001188 0.011247 0.106 0.9160
## acceleration 0.058783 0.145438 0.404 0.6864
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.441 on 318 degrees of freedom
## Multiple R-squared: 0.6923, Adjusted R-squared: 0.6875
## F-statistic: 143.1 on 5 and 318 DF, p-value: < 2.2e-16
# Draw the plots produced by plot() for the fitted lm object, each titled with
# the author tag.
plot(
  x = reg,
  main = "20MIC0105-PALEPU VENKATA HEMANTH"
)
4. prediction from model
# Predicted mpg for the held-out rows; the result is named by test-set row id.
y_pred <- predict(
  object = reg,
  newdata = test_set
)
y_pred
## 3 5 9 10 11 14 19 32
## 18.654565 18.895450 10.009580 14.614133 17.167934 18.117921 29.483840 28.622410
## 33 34 36 37 41 42 44 46
## 29.669408 25.714480 21.674233 22.279336 14.382457 14.775487 10.142013 23.420542
## 47 57 65 70 83 95 96 97
## 28.704812 31.551533 14.568713 12.294899 26.903082 8.541338 6.883159 15.453070
## 110 114 115 122 125 126 132 134
## 28.776592 26.405190 28.652714 18.878619 16.207906 23.229852 32.461776 19.025312
## 135 136 142 144 151 152 159 160
## 19.627866 19.799549 29.247654 28.822405 27.791196 31.110658 12.429342 11.520532
## 169 172 181 182 193 194 201 203
## 26.783617 25.710847 25.183919 32.969674 21.286115 24.357091 21.323544 22.826489
## 208 211 212 214 215 227 229 239
## 22.901743 23.684323 17.938099 15.186994 16.978602 20.968627 20.766691 30.084379
## 240 243 267 291 292 300 314 348
## 31.246386 25.756715 30.187213 15.439839 18.725742 24.348324 26.273524 31.542636
## 366 372 373 374 378 382 384 392
## 23.779432 27.372424 26.016534 25.048596 31.363830 29.519959 31.278073 28.134681
## 396 397
## 28.506549 27.085891
# Index plot of the predicted values, tagged with the author label in the
# top-right corner.
plot(x = y_pred)
legend(
  x = "topright",
  legend = "20MIC0105-PALEPU VENKATA HEMANTH"
)
5. find RMSE AND R-SQUARED VALUES
# Root-mean-squared error of the predictions against the observed test mpg.
RMSE <- sqrt(mean((test_set$mpg - y_pred)^2))
RMSE
## [1] 3.517136
# R-squared as reported by summary() for the fitted model (i.e. on the data the
# model was trained on).
summary(reg)[["r.squared"]] #r-squared value
## [1] 0.6923146
#B. LOGISTIC REGRESSION
1. import dataset
# Reload the Auto MPG data from the UCI repository. The file has no header
# row; "?" entries are read as NA rather than being replaced by a 0 sentinel.
data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
dataset <- read.table(
  data_url,
  header = FALSE,
  na.strings = "?",
  col.names = c(
    "mpg", "cylinders", "displacement", "horsepower", "weight",
    "acceleration", "model_year", "origin", "car_name"
  )
)
dataset$horsepower <- as.numeric(dataset$horsepower)

# Impute missing horsepower with the mean of the observed values; the divisor
# is the actual number of observed values instead of a hard-coded 392.
hp_missing <- is.na(dataset$horsepower)
replace_mean <- sum(dataset$horsepower, na.rm = TRUE) / sum(!hp_missing)
dataset$horsepower[hp_missing] <- replace_mean

# View() needs a GUI data viewer, so skip it in non-interactive runs.
if (interactive()) {
  View(dataset)
}

# Keep the first eight columns, dropping the free-text car_name column.
dataset <- dataset[1:8]
head(dataset)
## mpg cylinders displacement horsepower weight acceleration model_year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
# Structure of the reduced data frame (types and leading values per column).
str(dataset)
## 'data.frame': 398 obs. of 8 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ model_year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
# View() opens a spreadsheet-style viewer window, so it is only called in an
# interactive session; non-interactive runs skip it.
if (interactive()) {
  View(dataset)
}
2. splitting the dataset
library(caTools)
# Seed the RNG so the random partition below is reproducible.
set.seed(123)
# TRUE rows form the training set, FALSE rows the test set; SplitRatio = 0.8 is
# the requested training share.
split <- sample.split(dataset$mpg, SplitRatio = 0.8)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)
# View() needs a GUI data viewer, so skip it in non-interactive runs.
if (interactive()) {
  View(training_set)
  View(test_set)
}
dim(training_set)
## [1] 338 8
dim(test_set)
## [1] 60 8
3. feature scaling
# Standardise every column except column 8 (origin). Note that this includes
# the response column, mpg.
scaled_train <- scale(training_set[-8])
training_set[-8] <- scaled_train
# Reuse the training means and standard deviations for the test set. Scaling
# the test set with its own statistics would put the two sets on different
# scales, so coefficients fitted on the training set would not carry over.
test_set[-8] <- scale(
  test_set[-8],
  center = attr(scaled_train, "scaled:center"),
  scale = attr(scaled_train, "scaled:scale")
)
4. fitting a model
# Generalised linear model of mpg on five predictors, fitted on the training
# rows.
# NOTE(review): family = gaussian() fits a continuous response, not a logistic
# (binomial) model; the name `classifier` is kept for the later steps.
classifier <- glm(
  formula = mpg ~ weight + horsepower + cylinders + displacement + acceleration,
  family = gaussian(),
  data = training_set
)
classifier
##
## Call: glm(formula = mpg ~ weight + horsepower + cylinders + displacement +
## acceleration, family = gaussian(), data = training_set)
##
## Coefficients:
## (Intercept) weight horsepower cylinders displacement
## 3.416e-16 -6.106e-01 -2.004e-01 -6.696e-03 -3.961e-02
## acceleration
## 1.676e-02
##
## Degrees of Freedom: 337 Total (i.e. Null); 332 Residual
## Null Deviance: 337
## Residual Deviance: 98.97 AIC: 558
# Draw the plots produced by plot() for the fitted glm object, then print the
# full coefficient and deviance summary.
plot(
  x = classifier,
  main = '20MIC0105 - PALEPU VENKATA HEMANTH'
)
summary(object = classifier)
##
## Call:
## glm(formula = mpg ~ weight + horsepower + cylinders + displacement +
## acceleration, family = gaussian(), data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.45017 -0.35959 -0.06621 0.28915 1.99346
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.416e-16 2.970e-02 0.000 1.0000
## weight -6.106e-01 9.211e-02 -6.629 1.37e-10 ***
## horsepower -2.004e-01 8.312e-02 -2.411 0.0164 *
## cylinders -6.696e-03 9.606e-02 -0.070 0.9445
## displacement -3.961e-02 1.285e-01 -0.308 0.7582
## acceleration 1.676e-02 4.573e-02 0.366 0.7143
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2980903)
##
## Null deviance: 337.000 on 337 degrees of freedom
## Residual deviance: 98.966 on 332 degrees of freedom
## AIC: 558.05
##
## Number of Fisher Scoring iterations: 2
5. prediction from model
# Model predictions for the test rows on the response scale; values are named
# by test-set row id.
y_pred <- predict(
  classifier,
  newdata = test_set,
  type = "response"
)
y_pred
## 4 10 15 18 20 23
## -0.52527358 -1.07536127 0.61325867 0.47412110 1.28014160 0.62927749
## 39 46 53 56 63 68
## -1.17474397 0.04458105 0.93423296 1.20284944 -1.22152031 -1.72723762
## 73 76 79 80 81 82
## -0.85261601 -0.98201021 0.24007327 0.89738556 0.64374421 0.70630906
## 83 88 89 91 94 99
## 0.50139905 -0.91104862 -0.88377590 -1.90447357 -1.09418150 -0.10377639
## 108 111 113 117 124 143
## 0.23695374 0.62391581 0.72470129 -1.57854928 0.13458411 1.06132389
## 158 161 166 169 175 176
## -1.23038315 -0.58094171 -0.15085082 0.48225029 0.13100067 1.05225138
## 181 182 190 194 196 211
## 0.28750530 1.25877105 -0.74660089 0.19752428 1.12113343 0.12705115
## 213 215 216 226 239 257
## -1.37124380 -0.72247785 -0.75028744 -0.33679120 0.89710872 -0.20867642
## 263 281 284 293 339 348
## -0.48807816 -0.16249056 -0.03722459 -0.90487977 0.57866459 1.08351234
## 368 369 379 381 391 398
## 0.50783141 0.47670324 0.95012627 0.79481479 0.37948027 0.45091310
# Index plot of the raw predictions followed by their five-number summary and
# mean.
plot(
  x = y_pred,
  main = '20MIC0105 - PALEPU VENKATA HEMANTH'
)
summary(object = y_pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.9045 -0.7475 0.1661 0.0000 0.6329 1.2801
# Threshold the predictions at 0: values above 0 become 1, all others 0.
# Assigning through `[]` keeps the row-id names on the vector.
y_pred[] <- as.numeric(y_pred > 0)
y_pred
## 4 10 15 18 20 23 39 46 53 56 63 68 73 76 79 80 81 82 83 88
## 0 0 1 1 1 1 0 1 1 1 0 0 0 0 1 1 1 1 1 0
## 89 91 94 99 108 111 113 117 124 143 158 161 166 169 175 176 181 182 190 194
## 0 0 0 0 1 1 1 0 1 1 0 0 0 1 1 1 1 1 0 1
## 196 211 213 215 216 226 239 257 263 281 284 293 339 348 368 369 379 381 391 398
## 1 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 1 1 1
# Index plot of the thresholded (0/1) predictions.
plot(
  x = y_pred,
  main = '20MIC0105 - PALEPU VENKATA HEMANTH'
)
6. confusion matrix
# y_pred flags test rows whose predicted (standardised) mpg is above 0, so the
# reference labels must be the same threshold applied to the observed mpg.
# Tabulating against column 8 (origin, three levels) compared unrelated
# quantities and produced a non-square table.
actual_high_mpg <- ifelse(test_set$mpg > 0, 1, 0)
cm <- table(actual = actual_high_mpg, y_pred)
cm
## y_pred
## 0 1
## 1 26 15
## 2 0 9
## 3 0 10
# Accuracy = share of test rows whose predicted class equals the actual class
# (observed standardised mpg above 0). It is computed from the label vectors
# directly, because sum(diag(cm)) is only meaningful when cm is square with
# matching row and column classes.
acc <- mean(y_pred == ifelse(test_set$mpg > 0, 1, 0))
acc
## [1] 0.5833333
#THE END OF ALGORITHMS AND ANALYSIS
#THE GGPLOTS OF THE GIVEN DATASETS
1. import dataset
# Read the local CSV copy of the data. Treating "?" as NA lets horsepower be
# parsed as a numeric column instead of character.
# NOTE(review): assumes the CSV marks missing horsepower with "?", as the UCI
# source file does -- confirm against auto-mpg.csv.
dataset <- read.csv("auto-mpg.csv", na.strings = "?")
# View() needs a GUI data viewer, so skip it in non-interactive runs.
if (interactive()) {
  View(dataset)
}
head(dataset)
## mpg cylinders displacement horsepower weight acceleration model.year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## car.name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
# Structure of the CSV-loaded data frame (types and leading values per column).
str(dataset)
## 'data.frame': 398 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : chr "130" "165" "150" "150" ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ model.year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ car.name : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
library(ggplot2)
2. box plot
# Base-graphics box plot of mpg for each cylinder count.
boxplot(
  mpg ~ cylinders,
  data = dataset,
  xlab = "Number of Cylinders",
  ylab = "Miles Per Gallon",
  main = "Mileage Data"
)
legend("topright", legend = c("20MIC0105 - PALEPU VENKATA HEMANTH"))
# ggplot2 version. cylinders is numeric, so it is converted to a factor to get
# one box per cylinder count; a continuous x gives a single box and the
# "Continuous x aesthetic" warning.
ggplot(dataset, aes(x = factor(cylinders), y = mpg)) +
  geom_boxplot() +
  xlab("cylinders") +
  ggtitle("BOXPLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
# Further ggplot2 box plots with different grouping variables.
# NOTE(review): the first call passes `origin` as an extra unnamed aesthetic,
# and several calls group by a continuous column -- confirm the intended
# mappings.
ggplot(
  dataset,
  aes(x = weight, y = mpg, group = model.year, origin)
) +
  geom_boxplot() +
  ggtitle("BOXPLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(
  dataset,
  aes(x = weight, y = mpg, group = origin)
) +
  geom_boxplot() +
  ggtitle("BOXPLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(
  dataset,
  aes(x = mpg, group = weight, fill = origin)
) +
  geom_boxplot() +
  ggtitle("BOXPLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(
  dataset,
  aes(x = mpg, group = displacement, fill = origin)
) +
  geom_boxplot() +
  ggtitle("BOXPLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(
  dataset,
  aes(x = mpg, group = acceleration, fill = origin)
) +
  geom_boxplot() +
  ggtitle("BOXPLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")
3. histogram
# Base-graphics histograms. The x-axis labels now name the variable that is
# actually plotted (both were previously labelled "Weight").
hist(dataset$mpg, xlab = "Miles Per Gallon", col = "yellow", border = "blue")
legend("topright", legend = c("20mic0105 - PALEPU VENKATA HEMANTH"))
hist(dataset$displacement, xlab = "Displacement", col = "yellow", border = "blue")
legend("topright", legend = c("20mic0105 - PALEPU VENKATA HEMANTH"))
# ggplot2 version. Fixed colours are set on the geom, not inside aes(): inside
# aes() the string "yellow" is mapped as a data category (default palette plus
# a legend), and `border` is not a ggplot2 aesthetic at all.
ggplot(dataset, aes(x = mpg)) +
  geom_histogram(fill = "yellow", colour = "blue") +
  ggtitle("HISTOGRAM (20MIC0105 - PALEPU VENKATA HEMANTH)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histograms of mpg, filled by origin, with three different grouping columns.
ggplot(dataset, aes(x = mpg, group = weight, fill = origin)) +
  geom_histogram() +
  ggtitle("HISTOGRAM (20MIC0105 - PALEPU VENKATA HEMANTH)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(dataset, aes(x = mpg, group = displacement, fill = origin)) +
  geom_histogram() +
  ggtitle("HISTOGRAM (20MIC0105 - PALEPU VENKATA HEMANTH)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Title corrected from "HISTIGRAM".
ggplot(dataset, aes(x = mpg, group = acceleration, fill = origin)) +
  geom_histogram() +
  ggtitle("HISTOGRAM (20MIC0105 - PALEPU VENKATA HEMANTH)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
4. scatter plot (only 2 attributes)
# Base-graphics scatter plot of mpg against weight ("Milage" corrected to
# "Mileage" in the labels).
plot(
  x = dataset$weight,
  y = dataset$mpg,
  xlab = "Weight",
  ylab = "Mileage",
  main = "Weight vs Mileage"
)
legend("topright", legend = c("20MIC0105 - PALEPU VENKATA HEMANTH"))
# ggplot2 version. The point colour rgb(255, 65, 54) is written as its hex code
# and set on the geom; as a string inside aes() it was treated as a category
# label, not a colour.
ggplot(dataset, aes(x = mpg, y = weight)) +
  geom_point(colour = "#FF4136") +
  ggtitle("SCATTER PLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")
5. scatter plot (more than 2 variables)
# Scatter-plot matrix of four numeric columns.
pairs(
  ~ weight + mpg + displacement + cylinders,
  data = dataset,
  main = "Scatterplot Matrix(20MIC0105 - PALEPU VENKATA HEMANTH)"
)

# Scatter plots of mpg against other columns, coloured by origin.
ggplot(dataset, aes(x = mpg, y = weight, col = origin)) +
  geom_point() +
  ggtitle("SCATTER PLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")
ggplot(dataset, aes(x = mpg, y = displacement, col = origin)) +
  geom_point() +
  ggtitle("SCATTER PLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")
ggplot(dataset, aes(x = mpg, y = acceleration, col = origin)) +
  geom_point() +
  ggtitle("SCATTER PLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")
# horsepower is converted to numeric so it is drawn on a continuous axis even
# when the column was read as character; entries that are not numbers become
# NA and are dropped from the plot.
ggplot(
  dataset,
  aes(x = mpg, y = as.numeric(as.character(horsepower)), col = origin)
) +
  geom_point() +
  ylab("horsepower") +
  ggtitle("SCATTER PLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")

# The same relationships, coloured by model year.
ggplot(dataset, aes(x = mpg, y = displacement, col = model.year)) +
  geom_point() +
  ggtitle("SCATTER PLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")
ggplot(dataset, aes(x = mpg, y = acceleration, col = model.year)) +
  geom_point() +
  ggtitle("SCATTER PLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")
ggplot(
  dataset,
  aes(x = mpg, y = as.numeric(as.character(horsepower)), col = model.year)
) +
  geom_point() +
  ylab("horsepower") +
  ggtitle("SCATTER PLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")
# Colour by model year and point size by origin.
ggplot(dataset, aes(x = mpg, y = weight, col = model.year, size = origin)) +
  geom_point() +
  ggtitle("SCATTER PLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")
6. barplot
# Each barplot() call draws one bar per row (car); the bar height is the value
# of the plotted column, so the axis labels name the cars and that column
# instead of "frequency" or an unrelated variable.
barplot(dataset$mpg, col = "blue", xlab = "cars", ylab = "Miles Per Gallon")
legend("topright", legend = c("20MIC0105 - PALEPU VENKATA HEMANTH"))
barplot(
  dataset$cylinders,
  col = "green",
  xlab = "cars",
  ylab = "Number of Cylinders"
)
legend("topright", legend = c("20MIC0105 - PALEPU VENKATA HEMANTH"))
barplot(
  dataset$mpg,
  names.arg = dataset$car.name,
  xlab = "Name of cars",
  ylab = "Miles Per Gallon",
  col = "blue",
  main = "BAR PLOT",
  border = "red"
)
legend("topright", legend = c("20MIC0105 - PALEPU VENKATA HEMANTH"))
# This plot shows displacement, so the y label no longer says
# "Miles Per Gallon".
barplot(
  dataset$displacement,
  names.arg = dataset$car.name,
  xlab = "Name of cars",
  ylab = "Displacement",
  col = "blue",
  main = "BAR PLOT",
  border = "red"
)
legend("topright", legend = c("20MIC0105 - PALEPU VENKATA HEMANTH"))
# ggplot2 bars whose heights are the displacement values at each mpg.
ggplot(dataset, aes(x = mpg, y = displacement)) +
  geom_bar(stat = "identity") +
  ggtitle("BAR PLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")
7. line plot
# Line graph connecting displacement values in order of mpg.
ggplot(
  data = dataset,
  mapping = aes(x = mpg, y = displacement)
) +
  geom_line() +
  ggtitle("LINE GRAPH (20MIC0105 - PALEPU VENKATA HEMANTH)")
8. BUBBLE PLOT
# Bubble size: one tenth of the gap between displacement and mpg for each car.
dif <- (dataset$displacement - dataset$mpg) / 10
# The point colour rgb(255, 65, 54) is written as its hex code and set on the
# geom; as a string inside aes() it was treated as a category label, not a
# colour.
ggplot(dataset, aes(x = mpg, y = displacement, size = dif)) +
  geom_point(colour = "#FF4136") +
  ggtitle("BUBBLE PLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")
9. violin plot
# Violin plots of displacement and of cylinders against mpg. `dif` is not a
# column of `dataset`; it must already exist in the calling environment.
# NOTE(review): the colour string sits inside aes(), so it is mapped as a
# constant category instead of being used as a literal colour -- confirm
# intent.
ggplot(
  data = dataset,
  mapping = aes(x = mpg, y = displacement, size = dif, col = 'rgb(255, 65, 54)')
) +
  geom_violin() +
  ggtitle("VIOLIN PLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(
  data = dataset,
  mapping = aes(x = mpg, y = cylinders, size = dif, col = 'rgb(255, 65, 54)')
) +
  geom_violin() +
  ggtitle("VIOLIN PLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")
10. DENSITY PLOT
# Kernel density estimates of mpg and of displacement.
ggplot(
  data = dataset,
  mapping = aes(x = mpg)
) +
  geom_density() +
  ggtitle("DENSITY PLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(
  data = dataset,
  mapping = aes(x = displacement)
) +
  geom_density() +
  ggtitle("DENSITY PLOT (20MIC0105 - PALEPU VENKATA HEMANTH)")
THE END OF GGPLOTS
THE END