#ANALYSIS OF AUTO-MPG DATASET ##BY Palepu Venkata Hemanth - 20MIC0105

#A. LINEAR REGRESSION

1. import dataset

url <- url("https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data")
dataset=read.table(url,header=F,col.names = c("mpg","cylinders","displacement","horsepower","weight","acceleration","model_year", "origin","car_name"))
dataset$horsepower = ifelse(dataset$horsepower == '?',0,dataset$horsepower)
dataset$horsepower <- as.numeric(dataset$horsepower)
as.integer(sum(dataset$horsepower)/392) -> replace_mean
dataset$horsepower = ifelse(dataset$horsepower ==  0,replace_mean,dataset$horsepower)
head(dataset)
##   mpg cylinders displacement horsepower weight acceleration model_year origin
## 1  18         8          307        130   3504         12.0         70      1
## 2  15         8          350        165   3693         11.5         70      1
## 3  18         8          318        150   3436         11.0         70      1
## 4  16         8          304        150   3433         12.0         70      1
## 5  17         8          302        140   3449         10.5         70      1
## 6  15         8          429        198   4341         10.0         70      1
##                    car_name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500
str(dataset)
## 'data.frame':    398 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ model_year  : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ car_name    : chr  "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caTools)
glimpse(dataset)
## Rows: 398
## Columns: 9
## $ mpg          <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15, 14, 2…
## $ cylinders    <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4, …
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 383, 34…
## $ horsepower   <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170, 16…
## $ weight       <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425, 385…
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8.5, …
## $ model_year   <int> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7…
## $ origin       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, …
## $ car_name     <chr> "chevrolet chevelle malibu", "buick skylark 320", "plymou…
View(dataset)

2. SPLITTING THE DATA

split = sample.split(dataset$mpg,SplitRatio = 0.75)
training_set=subset(dataset,split==TRUE)
test_set = subset(dataset,split==FALSE)
dim(training_set)
## [1] 324   9
dim(test_set)
## [1] 74  9

3. regression model

reg = lm(formula = mpg~weight+horsepower+cylinders+displacement+acceleration,data = training_set)
reg
## 
## Call:
## lm(formula = mpg ~ weight + horsepower + cylinders + displacement + 
##     acceleration, data = training_set)
## 
## Coefficients:
##  (Intercept)        weight    horsepower     cylinders  displacement  
##    45.348877     -0.006056     -0.036800     -0.173980      0.001188  
## acceleration  
##     0.058783
summary(reg)
## 
## Call:
## lm(formula = mpg ~ weight + horsepower + cylinders + displacement + 
##     acceleration, data = training_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.5297  -2.8032  -0.5419   2.3327  15.9618 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  45.348877   3.003676  15.098  < 2e-16 ***
## weight       -0.006056   0.001049  -5.772 1.86e-08 ***
## horsepower   -0.036800   0.019462  -1.891   0.0595 .  
## cylinders    -0.173980   0.484887  -0.359   0.7200    
## displacement  0.001188   0.011247   0.106   0.9160    
## acceleration  0.058783   0.145438   0.404   0.6864    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.441 on 318 degrees of freedom
## Multiple R-squared:  0.6923, Adjusted R-squared:  0.6875 
## F-statistic: 143.1 on 5 and 318 DF,  p-value: < 2.2e-16
plot(reg,main = "20MIC0105-PALEPU VENKATA HEMANTH")

4. prediction from model

y_pred = predict(reg,newdata = test_set) 
y_pred
##         3         5         9        10        11        14        19        32 
## 18.654565 18.895450 10.009580 14.614133 17.167934 18.117921 29.483840 28.622410 
##        33        34        36        37        41        42        44        46 
## 29.669408 25.714480 21.674233 22.279336 14.382457 14.775487 10.142013 23.420542 
##        47        57        65        70        83        95        96        97 
## 28.704812 31.551533 14.568713 12.294899 26.903082  8.541338  6.883159 15.453070 
##       110       114       115       122       125       126       132       134 
## 28.776592 26.405190 28.652714 18.878619 16.207906 23.229852 32.461776 19.025312 
##       135       136       142       144       151       152       159       160 
## 19.627866 19.799549 29.247654 28.822405 27.791196 31.110658 12.429342 11.520532 
##       169       172       181       182       193       194       201       203 
## 26.783617 25.710847 25.183919 32.969674 21.286115 24.357091 21.323544 22.826489 
##       208       211       212       214       215       227       229       239 
## 22.901743 23.684323 17.938099 15.186994 16.978602 20.968627 20.766691 30.084379 
##       240       243       267       291       292       300       314       348 
## 31.246386 25.756715 30.187213 15.439839 18.725742 24.348324 26.273524 31.542636 
##       366       372       373       374       378       382       384       392 
## 23.779432 27.372424 26.016534 25.048596 31.363830 29.519959 31.278073 28.134681 
##       396       397 
## 28.506549 27.085891
plot(y_pred)
legend("topright",legend = c("20MIC0105-PALEPU VENKATA HEMANTH"))

5. find RMSE AND R-SQUARED VALUES

RMSE = sqrt(mean((y_pred - test_set$mpg)^2))
RMSE
## [1] 3.517136
summary(reg)$r.squared #r-squared value
## [1] 0.6923146

#B. LOGISTIC REGRESSION

1. import dataset

url <- url("https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data")
dataset=read.table(url,header=F,col.names = c("mpg","cylinders","displacement","horsepower","weight","acceleration","model_year", "origin","car_name"))
dataset$horsepower = ifelse(dataset$horsepower == '?',0,dataset$horsepower)
dataset$horsepower <- as.numeric(dataset$horsepower)
replace_mean=sum(dataset$horsepower)/392
dataset$horsepower = ifelse(dataset$horsepower ==  0,replace_mean,dataset$horsepower)
View(dataset)
dataset = dataset[1:8]
head(dataset)
##   mpg cylinders displacement horsepower weight acceleration model_year origin
## 1  18         8          307        130   3504         12.0         70      1
## 2  15         8          350        165   3693         11.5         70      1
## 3  18         8          318        150   3436         11.0         70      1
## 4  16         8          304        150   3433         12.0         70      1
## 5  17         8          302        140   3449         10.5         70      1
## 6  15         8          429        198   4341         10.0         70      1
str(dataset)
## 'data.frame':    398 obs. of  8 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ model_year  : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
View(dataset)

2. splitting the dataset

library(caTools)
set.seed(123)
split = sample.split(dataset$mpg,SplitRatio = 0.8)
training_set = subset(dataset,split==TRUE)
test_set = subset(dataset,split==FALSE)
View(training_set)
View(test_set)
dim(training_set)
## [1] 338   8
dim(test_set)
## [1] 60  8

3. feature scalling

training_set[-8] = scale(training_set[-8])
test_set[-8] = scale(test_set[-8])

4. fitting a model

classifier = glm(formula=mpg~weight+horsepower+cylinders+displacement+acceleration,data = training_set,family=gaussian())
classifier
## 
## Call:  glm(formula = mpg ~ weight + horsepower + cylinders + displacement + 
##     acceleration, family = gaussian(), data = training_set)
## 
## Coefficients:
##  (Intercept)        weight    horsepower     cylinders  displacement  
##    3.416e-16    -6.106e-01    -2.004e-01    -6.696e-03    -3.961e-02  
## acceleration  
##    1.676e-02  
## 
## Degrees of Freedom: 337 Total (i.e. Null);  332 Residual
## Null Deviance:       337 
## Residual Deviance: 98.97     AIC: 558
plot(classifier,main = '20MIC0105 - PALEPU VENKATA HEMANTH')

summary(classifier)
## 
## Call:
## glm(formula = mpg ~ weight + horsepower + cylinders + displacement + 
##     acceleration, family = gaussian(), data = training_set)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.45017  -0.35959  -0.06621   0.28915   1.99346  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.416e-16  2.970e-02   0.000   1.0000    
## weight       -6.106e-01  9.211e-02  -6.629 1.37e-10 ***
## horsepower   -2.004e-01  8.312e-02  -2.411   0.0164 *  
## cylinders    -6.696e-03  9.606e-02  -0.070   0.9445    
## displacement -3.961e-02  1.285e-01  -0.308   0.7582    
## acceleration  1.676e-02  4.573e-02   0.366   0.7143    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2980903)
## 
##     Null deviance: 337.000  on 337  degrees of freedom
## Residual deviance:  98.966  on 332  degrees of freedom
## AIC: 558.05
## 
## Number of Fisher Scoring iterations: 2

5. prediction from model

y_pred = predict(object=classifier,newdata=test_set,type="response")
y_pred
##           4          10          15          18          20          23 
## -0.52527358 -1.07536127  0.61325867  0.47412110  1.28014160  0.62927749 
##          39          46          53          56          63          68 
## -1.17474397  0.04458105  0.93423296  1.20284944 -1.22152031 -1.72723762 
##          73          76          79          80          81          82 
## -0.85261601 -0.98201021  0.24007327  0.89738556  0.64374421  0.70630906 
##          83          88          89          91          94          99 
##  0.50139905 -0.91104862 -0.88377590 -1.90447357 -1.09418150 -0.10377639 
##         108         111         113         117         124         143 
##  0.23695374  0.62391581  0.72470129 -1.57854928  0.13458411  1.06132389 
##         158         161         166         169         175         176 
## -1.23038315 -0.58094171 -0.15085082  0.48225029  0.13100067  1.05225138 
##         181         182         190         194         196         211 
##  0.28750530  1.25877105 -0.74660089  0.19752428  1.12113343  0.12705115 
##         213         215         216         226         239         257 
## -1.37124380 -0.72247785 -0.75028744 -0.33679120  0.89710872 -0.20867642 
##         263         281         284         293         339         348 
## -0.48807816 -0.16249056 -0.03722459 -0.90487977  0.57866459  1.08351234 
##         368         369         379         381         391         398 
##  0.50783141  0.47670324  0.95012627  0.79481479  0.37948027  0.45091310
plot(y_pred,main = '20MIC0105 - PALEPU VENKATA HEMANTH')

summary(y_pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.9045 -0.7475  0.1661  0.0000  0.6329  1.2801
y_pred = ifelse(y_pred>0,1,0)
y_pred
##   4  10  15  18  20  23  39  46  53  56  63  68  73  76  79  80  81  82  83  88 
##   0   0   1   1   1   1   0   1   1   1   0   0   0   0   1   1   1   1   1   0 
##  89  91  94  99 108 111 113 117 124 143 158 161 166 169 175 176 181 182 190 194 
##   0   0   0   0   1   1   1   0   1   1   0   0   0   1   1   1   1   1   0   1 
## 196 211 213 215 216 226 239 257 263 281 284 293 339 348 368 369 379 381 391 398 
##   1   1   0   0   0   0   1   0   0   0   0   0   1   1   1   1   1   1   1   1
plot(y_pred,main = '20MIC0105 - PALEPU VENKATA HEMANTH')

6. confusion matrix

cm = table(test_set[,8],y_pred)
cm
##    y_pred
##      0  1
##   1 26 15
##   2  0  9
##   3  0 10
acc = sum(diag(cm))/sum(cm)
acc
## [1] 0.5833333

#THE END OF ALGORITHMS AND ANALYSIS

#THE GGPLOTS OF THE GIVEN DATASETS

1. import dataset

dataset = read.csv("auto-mpg.csv")
View(dataset)
head(dataset)
##   mpg cylinders displacement horsepower weight acceleration model.year origin
## 1  18         8          307        130   3504         12.0         70      1
## 2  15         8          350        165   3693         11.5         70      1
## 3  18         8          318        150   3436         11.0         70      1
## 4  16         8          304        150   3433         12.0         70      1
## 5  17         8          302        140   3449         10.5         70      1
## 6  15         8          429        198   4341         10.0         70      1
##                    car.name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500
str(dataset)
## 'data.frame':    398 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : chr  "130" "165" "150" "150" ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ model.year  : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ car.name    : chr  "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
library(ggplot2)

2. box plot

boxplot(mpg ~ cylinders, data = dataset, xlab = "Number of Cylinders",
        ylab = "Miles Per Gallon", main = "Mileage Data")
legend("topright",legend=c("20MIC0105 - PALEPU VENKATA HEMANTH"))

ggplot(dataset,aes(x=cylinders,y=mpg))+geom_boxplot()+ggtitle("BOXPLOT   (20MIC0105 - PALEPU VENKATA HEMANTH)")
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

ggplot(dataset,aes(group = model.year,weight,mpg,origin))+geom_boxplot()+ggtitle("BOXPLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(dataset,aes(group = origin,weight,mpg))+geom_boxplot()+ggtitle("BOXPLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(dataset,aes(x=mpg,group=weight,fill = origin))+geom_boxplot()+ggtitle("BOXPLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(dataset,aes(x=mpg,group=displacement,fill = origin))+geom_boxplot()+ggtitle("BOXPLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(dataset,aes(x=mpg,group=acceleration,fill = origin))+geom_boxplot()+ggtitle("BOXPLOT   (20MIC0105 - PALEPU VENKATA HEMANTH)")

3. histogram

hist(dataset$mpg,xlab = "Weight",col = "yellow",border = "blue")
legend("topright",legend=c("20mic0105 - PALEPU VENKATA HEMANTH"))

hist(dataset$displacement,xlab = "Weight",col = "yellow",border = "blue")
legend("topright",legend=c("20mic0105 - PALEPU VENKATA HEMANTH"))

ggplot(dataset,aes(x=mpg,col='yellow',border='blue'))+geom_histogram()+ggtitle("HISTOGRAM    (20MIC0105 - PALEPU VENKATA HEMANTH)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dataset,aes(x=mpg,group=weight,fill = origin))+geom_histogram()+ggtitle("HISTOGRAM    (20MIC0105 - PALEPU VENKATA HEMANTH)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dataset,aes(x=mpg,group=displacement,fill = origin))+geom_histogram()+ggtitle("HISTOGRAM    (20MIC0105 - PALEPU VENKATA HEMANTH)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dataset,aes(x=mpg,group=acceleration,fill = origin))+geom_histogram()+ggtitle("HISTIGRAM   (20MIC0105 - PALEPU VENKATA HEMANTH)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

4. scatter plot (only 2 attributes)

plot(x = dataset$weight,y = dataset$mpg,
     xlab = "Weight",
     ylab = "Milage",        
     main = "Weight vs Milage"
)
legend("topright",legend=c("20MIC0105 - PALEPU VENKATA HEMANTH"))

ggplot(dataset,aes(x=mpg,y=weight,col='rgb(255, 65, 54)'))+geom_point()+ggtitle("SCATTER PLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

5. scatter plot (more than 2 variables)

pairs(~weight+mpg+displacement+cylinders,data = dataset,
      main = "Scatterplot Matrix(20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(dataset,aes(x=mpg,y=weight,col = origin))+geom_point()+ggtitle("SCATTER PLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(dataset,aes(x=mpg,y=displacement,col = origin))+geom_point()+ggtitle("SCATTER PLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(dataset,aes(x=mpg,y=acceleration,col = origin))+geom_point()+ggtitle("SCATTER PLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(dataset,aes(x=mpg,y=horsepower,col = origin))+geom_point()+ggtitle("SCATTER PLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(dataset,aes(x=mpg,y=displacement,col = model.year))+geom_point()+ggtitle("SCATTER PLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(dataset,aes(x=mpg,y=acceleration,col = model.year))+geom_point()+ggtitle("SCATTER PLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(dataset,aes(x=mpg,y=horsepower,col = model.year))+geom_point()+ggtitle("SCATTER PLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(dataset,aes(x=mpg,y=weight,col = model.year,size = origin))+geom_point()+ggtitle("SCATTER PLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

6. barplot

barplot(dataset$mpg,col = "blue",xlab="Miles Per Gallon",ylab="frequency")
legend("topright",legend=c("20MIC0105 - PALEPU VENKATA HEMANTH"))

barplot(dataset$cylinders,col = "green",xlab="cars",ylab="frequency")
legend("topright",legend=c("20MIC0105 - PALEPU VENKATA HEMANTH"))

barplot(dataset$mpg,names.arg=dataset$car.name,xlab="Name of cars",ylab="Miles Per Gallon",col="blue",
        main="BAR PLOT",border="red")
legend("topright",legend=c("20MIC0105 - PALEPU VENKATA HEMANTH"))

barplot(dataset$displacement,names.arg=dataset$car.name,xlab="Name of cars",ylab="Miles Per Gallon",col="blue",
        main="BAR PLOT",border="red")
legend("topright",legend=c("20MIC0105 - PALEPU VENKATA HEMANTH"))

ggplot(dataset,aes(x=mpg,y=displacement))+geom_bar(stat = "identity")+ggtitle("BAR PLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

7. line plot

ggplot(dataset,aes(x=mpg,y=displacement))+geom_line()+ggtitle("LINE GRAPH    (20MIC0105 - PALEPU VENKATA HEMANTH)")

8. BUBBLE PLOT

dif=(dataset$displacement-dataset$mpg)/10
ggplot(dataset,aes(x=mpg,y=displacement,size=dif,col='rgb(255, 65, 54)'))+geom_point()+ggtitle("BUBBLE PLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

9. violin plot

ggplot(dataset,aes(x=mpg,y=displacement,size=dif,col='rgb(255, 65, 54)'))+geom_violin()+ggtitle("VIOLIN PLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(dataset,aes(x=mpg,y=cylinders,size=dif,col='rgb(255, 65, 54)'))+geom_violin()+ggtitle("VIOLIN PLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

10. DENSITY PLOT

ggplot(dataset,aes(x=mpg))+geom_density()+ggtitle("DENSITY PLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

ggplot(dataset,aes(x=displacement))+geom_density()+ggtitle("DENSITY PLOT    (20MIC0105 - PALEPU VENKATA HEMANTH)")

THE END OF GGPLOTS

THE END