Problem 1

# Plot the line y = 2x - 50 for x in [1, 50].
# curve() has no "step" argument (sampling density is controlled by `n`);
# the stray step = 2 was forwarded through `...` to every low-level graphics
# call, and each of them warned that it is not a graphical parameter.
curve(2 * x - 50, from = 1, to = 50, xlab = "x", ylab = "y")

Problem 2.1

# Load the auto-mpg data. The file has no header row, so the column names are
# supplied here; names containing a space appear with a dot in the summary
# below (Model.Year, Car.Name).
mydata <- read.table(
  "auto-mpg.data-original",
  header = FALSE,
  col.names = c(
    "MPG", "Cylinders", "Displacement", "Horsepower", "Weight",
    "Acceleration", "Model Year", "Origin", "Car Name"
  )
)

Problem 2.2

# Per-column summary of the data frame: quartiles and mean for the numeric
# columns (plus an NA count where values are missing), and length/class/mode
# for the character column.
summary(mydata)
##       MPG          Cylinders      Displacement     Horsepower    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.00  
##  1st Qu.:17.50   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.75  
##  Median :23.00   Median :4.000   Median :151.0   Median : 95.00  
##  Mean   :23.51   Mean   :5.475   Mean   :194.8   Mean   :105.08  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:302.0   3rd Qu.:130.00  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.00  
##  NA's   :8                                       NA's   :6       
##      Weight      Acceleration     Model.Year        Origin     
##  Min.   :1613   Min.   : 8.00   Min.   :70.00   Min.   :1.000  
##  1st Qu.:2226   1st Qu.:13.70   1st Qu.:73.00   1st Qu.:1.000  
##  Median :2822   Median :15.50   Median :76.00   Median :1.000  
##  Mean   :2979   Mean   :15.52   Mean   :75.92   Mean   :1.569  
##  3rd Qu.:3618   3rd Qu.:17.18   3rd Qu.:79.00   3rd Qu.:2.000  
##  Max.   :5140   Max.   :24.80   Max.   :82.00   Max.   :3.000  
##                                                                
##    Car.Name        
##  Length:406        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Problem 2.3

# Distribution of engine displacement: a histogram first, then a normal
# Q-Q plot of the same column.
with(mydata, hist(Displacement, main = "Displacement Histogram", xlab = "Displacement"))

with(mydata, qqnorm(Displacement, main = "Displacement QQ Plot"))

Problem 2.4

# Horizontal box plot of car weight.
with(
  mydata,
  boxplot(Weight, main = "Car Weight Box Plot", xlab = "Car Weight", horizontal = TRUE)
)

The median is closer to the first quartile than to the third. The data are also skewed to the right: the box (the interquartile range) sits closer to the minimum than to the maximum, so the upper whisker is the longer one. There appear to be no outliers. The minimum is a little above 1500 and the maximum is a little above 5000.

Problem 2.5

# Treat the origin code as a categorical variable and confirm the conversion.
mydata$Origin <- factor(mydata$Origin)
is.factor(mydata$Origin)
## [1] TRUE
# One horizontal box per origin level; split() builds the per-level MPG
# groups that would otherwise be written out as three separate subsets.
boxplot(
  split(mydata$MPG, mydata$Origin),
  main = "MPG for Specified Origin",
  horizontal = TRUE
)
library(ggplot2)

# Same comparison drawn as a violin plot.
ggplot(data = mydata, mapping = aes(x = Origin, y = MPG)) +
  geom_violin()
## Warning: Removed 8 rows containing non-finite values (stat_ydensity).

Problem 2.6

# Bin car weight into three classes: (1500, 2800] "below", (2800, 2950] "avg",
# (2950, 5200] "above". The factor is stored in its own column so the numeric
# Weight values are not destroyed; they are still needed as numbers by the
# correlation analysis later in the script.
mydata$WeightClass <- cut(
  mydata$Weight,
  breaks = c(1500, 2800, 2950, 5200),
  labels = c("below", "avg", "above")
)
# Weight class against origin, conditioned on MPG. With `data = mydata` the
# formula uses bare column names; repeating `mydata$` in each term was
# redundant and leaked into the conditioning-panel label.
coplot(WeightClass ~ Origin | MPG, data = mydata, xlab = "Origin", ylab = "Weight")

## 
##  Missing rows: 11, 12, 13, 14, 15, 18, 40, 368

Problem 3.1

# Number of observations (rows) in the data frame; cat() prints the bare
# count without the "[1]" index prefix.
cat(nrow(mydata))
## 406

Problem 3.2

# Linear (column-major) positions of the NA cells, not row numbers; kept
# here for reference only.
# which(is.na(mydata))

# Total number of NA cells across the whole data frame.
sum(is.na(mydata))
## [1] 14

Problem 3.3

#install.packages("DMwR2")
#install.packages("dplyr")
# Number of missing values in each row. is.na() on a data frame yields a
# logical matrix, so rowSums() counts the NAs per row directly. This avoids
# apply(), which first coerces the mixed-type data frame to a character
# matrix and then loops over its rows in R code.
rowSums(is.na(mydata))
##   [1] 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [38] 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0
## [371] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#manyNAs(mydata, 0.1) #return rows with 10% N/A

Problem 3.4

# Mean imputation: every missing Horsepower / MPG entry is replaced by the
# mean of the observed values in the same column.
mydata$Horsepower <- replace(
  mydata$Horsepower,
  is.na(mydata$Horsepower),
  mean(mydata$Horsepower, na.rm = TRUE)
)
mydata$MPG <- replace(
  mydata$MPG,
  is.na(mydata$MPG),
  mean(mydata$MPG, na.rm = TRUE)
)

Problem 3.5

# Start again from the raw file so the columns are back to their values as
# read, with the original NAs in place.
mydata <- read.table(
  "auto-mpg.data-original",
  header = FALSE,
  col.names = c(
    "MPG", "Cylinders", "Displacement", "Horsepower", "Weight",
    "Acceleration", "Model Year", "Origin", "Car Name"
  )
)
# The five continuous variables (columns 1, 3, 4, 5 and 6), selected by name.
contVar <- mydata[, c("MPG", "Displacement", "Horsepower", "Weight", "Acceleration")]
# With the default NA handling, every pair involving MPG or Horsepower
# comes out as NA.
cor(contVar)
##              MPG Displacement Horsepower     Weight Acceleration
## MPG            1           NA         NA         NA           NA
## Displacement  NA    1.0000000         NA  0.9324747   -0.5579836
## Horsepower    NA           NA          1         NA           NA
## Weight        NA    0.9324747         NA  1.0000000   -0.4300858
## Acceleration  NA   -0.5579836         NA -0.4300858    1.0000000
# Same Pearson correlations, computed only on rows with no missing value.
cor(contVar, use = "complete.obs", method = "pearson")
##                     MPG Displacement Horsepower     Weight Acceleration
## MPG           1.0000000   -0.8051269 -0.7784268 -0.8322442    0.4233285
## Displacement -0.8051269    1.0000000  0.8972570  0.9329944   -0.5438005
## Horsepower   -0.7784268    0.8972570  1.0000000  0.8645377   -0.6891955
## Weight       -0.8322442    0.9329944  0.8645377  1.0000000   -0.4168392
## Acceleration  0.4233285   -0.5438005 -0.6891955 -0.4168392    1.0000000
# Regress horsepower on displacement. The formula uses bare column names
# resolved through `data`, which is what lets predict() evaluate the model on
# new rows; the fit is stored so nothing has to be copied from printed output.
hp_fit <- lm(Horsepower ~ Displacement, data = mydata)
hp_fit
## 
## Call:
## lm(formula = Horsepower ~ Displacement, data = mydata)
## 
## Coefficients:
##  (Intercept)  Displacement  
##      40.3247        0.3308
# Rows whose horsepower is missing, taken from the data rather than typed in
# by hand.
missing_hp <- which(is.na(mydata$Horsepower))
missing_hp
## [1]  39 134 338 344 362 383
# Predicted horsepower for those rows, using the full-precision coefficients
# of the stored fit. The result is named by row number and rounded to one
# decimal for display.
round(predict(hp_fit, newdata = mydata[missing_hp, ]), 1)
##    39   134   338   344   362   383
##  72.7 106.5  68.4  86.6  73.4  90.3
## [1] 90.2755
# Reload the raw data file into mydata, replacing the current contents.
mydata <- read.table(
  "auto-mpg.data-original",
  header = FALSE,
  col.names = c(
    "MPG", "Cylinders", "Displacement", "Horsepower", "Weight",
    "Acceleration", "Model Year", "Origin", "Car Name"
  )
)