Problem 1
# Plot the line y = 2x - 50 for x in [1, 50].
# curve() has no `step` argument: an unknown argument is passed on to the
# low-level graphics calls, each of which warns that "step" is not a graphical
# parameter. The sampling density of curve() is set by `n` (default 101), and
# for a straight line the default already draws the exact graph.
curve(2 * x - 50, from = 1, to = 50, xlab = "x", ylab = "y")
Problem 2.1
# Load the data file (it has no header row) and label its nine columns.
# read.table() makes the labels syntactic, so "Model Year" and "Car Name"
# end up as the columns Model.Year and Car.Name.
mydata <- read.table(
  "auto-mpg.data-original",
  header = FALSE,
  col.names = c(
    "MPG", "Cylinders", "Displacement", "Horsepower", "Weight",
    "Acceleration", "Model Year", "Origin", "Car Name"
  )
)
Problem 2.2
# Per-column summary of the data frame: quartiles, mean and NA counts for the
# numeric columns, length/class/mode for the character column.
summary(mydata)
## MPG Cylinders Displacement Horsepower
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.00
## 1st Qu.:17.50 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.75
## Median :23.00 Median :4.000 Median :151.0 Median : 95.00
## Mean :23.51 Mean :5.475 Mean :194.8 Mean :105.08
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:302.0 3rd Qu.:130.00
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.00
## NA's :8 NA's :6
## Weight Acceleration Model.Year Origin
## Min. :1613 Min. : 8.00 Min. :70.00 Min. :1.000
## 1st Qu.:2226 1st Qu.:13.70 1st Qu.:73.00 1st Qu.:1.000
## Median :2822 Median :15.50 Median :76.00 Median :1.000
## Mean :2979 Mean :15.52 Mean :75.92 Mean :1.569
## 3rd Qu.:3618 3rd Qu.:17.18 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :5140 Max. :24.80 Max. :82.00 Max. :3.000
##
## Car.Name
## Length:406
## Class :character
## Mode :character
##
##
##
##
Problem 2.3
# Look at the distribution of Displacement twice: as a histogram and as a
# normal Q-Q plot.
displacement <- mydata$Displacement
hist(displacement, main = "Displacement Histogram", xlab = "Displacement")
qqnorm(displacement, main = "Displacement QQ Plot")
Problem 2.4
# Horizontal box plot of the Weight column.
boxplot(
  mydata[["Weight"]],
  main = "Car Weight Box Plot",
  xlab = "Car Weight",
  horizontal = TRUE
)
The median lies closer to the first quartile than to the third. The data are skewed to the right: the box (the interquartile range) sits closer to the minimum than to the maximum, so the upper whisker is the longer one. There appear to be no outliers. The minimum is a little above 1500 and the maximum is just above 5000.
Problem 2.5
# Store Origin as a factor so it is treated as a categorical variable, then
# confirm the conversion took effect.
mydata[["Origin"]] <- as.factor(mydata[["Origin"]])
is.factor(mydata[["Origin"]])
## [1] TRUE
# One horizontal box per Origin value (1, 2, 3), each showing that group's MPG.
mpg_by_origin <- split(mydata$MPG, mydata$Origin)
boxplot(
  mpg_by_origin,
  horizontal = TRUE,
  main = "MPG for Specified Origin"
)
library(ggplot2)
# Violin plot of MPG for each Origin level.
ggplot(data = mydata, mapping = aes(x = Origin, y = MPG)) +
  geom_violin()
## Warning: Removed 8 rows containing non-finite values (stat_ydensity).
Problem 2.6
# Bin Weight into three labelled classes. The bins go into a new column so the
# numeric Weight measurements are not overwritten and stay usable afterwards.
mydata$WeightClass <- cut(
  mydata$Weight,
  breaks = c(1500, 2800, 2950, 5200),
  labels = c("below", "avg", "above")
)
# Conditioning plot of the weight class against Origin, given MPG. Columns are
# named bare in the formula and looked up in `data`.
coplot(
  WeightClass ~ Origin | MPG,
  data = mydata,
  xlab = "Origin",
  ylab = "Weight"
)
##
## Missing rows: 11, 12, 13, 14, 15, 18, 40, 368
Problem 3.1
# Print the number of rows in the data frame.
cat(nrow(mydata))
## 406
Problem 3.2
# Disabled alternative: list the positions of the NA entries.
# which(is.na(mydata))
# Total number of NA cells in the whole data frame.
sum(is.na(mydata))
## [1] 14
Problem 3.3
#install.packages("DMwR2")
#install.packages("dplyr")
# Number of missing values in each row. is.na() on a data frame gives a
# logical matrix, and rowSums() adds it up row by row; this avoids apply(),
# which would first coerce every column of the data frame to character.
rowSums(is.na(mydata))
## [1] 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [38] 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0
## [371] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#manyNAs(mydata, 0.1) #return rows with 10% N/A
Problem 3.4
# Mean imputation: every missing Horsepower / MPG value is replaced by the
# mean of that column's observed (non-NA) values.
hp_is_missing <- is.na(mydata$Horsepower)
mydata$Horsepower[hp_is_missing] <- mean(mydata$Horsepower[!hp_is_missing])

mpg_is_missing <- is.na(mydata$MPG)
mydata$MPG[mpg_is_missing] <- mean(mydata$MPG[!mpg_is_missing])
Problem 3.5
# Read the file again, discarding the earlier in-place changes to mydata, so
# the missing values are present for this part of the analysis.
mydata <- read.table(
  "auto-mpg.data-original",
  header = FALSE,
  col.names = c(
    "MPG", "Cylinders", "Displacement", "Horsepower", "Weight",
    "Acceleration", "Model Year", "Origin", "Car Name"
  )
)
# Keep only the five continuous variables (columns 1, 3, 4, 5 and 6) and
# compute their correlation matrix. Entries involving a column that contains
# NA come out as NA here.
continuous_cols <- c(
  "MPG", "Displacement", "Horsepower", "Weight", "Acceleration"
)
contVar <- mydata[, continuous_cols]
cor(contVar)
## MPG Displacement Horsepower Weight Acceleration
## MPG 1 NA NA NA NA
## Displacement NA 1.0000000 NA 0.9324747 -0.5579836
## Horsepower NA NA 1 NA NA
## Weight NA 0.9324747 NA 1.0000000 -0.4300858
## Acceleration NA -0.5579836 NA -0.4300858 1.0000000
# Pearson correlations computed only from rows in which all five variables
# are observed.
pearson_complete <- cor(contVar, use = "complete.obs", method = "pearson")
pearson_complete
## MPG Displacement Horsepower Weight Acceleration
## MPG 1.0000000 -0.8051269 -0.7784268 -0.8322442 0.4233285
## Displacement -0.8051269 1.0000000 0.8972570 0.9329944 -0.5438005
## Horsepower -0.7784268 0.8972570 1.0000000 0.8645377 -0.6891955
## Weight -0.8322442 0.9329944 0.8645377 1.0000000 -0.4168392
## Acceleration 0.4233285 -0.5438005 -0.6891955 -0.4168392 1.0000000
# Simple linear regression of Horsepower on Displacement. The columns are
# named bare in the formula and resolved through `data`; writing `mydata$...`
# inside the formula bypasses `data` and makes predict(newdata = ...)
# unusable with the fitted model.
lm(Horsepower ~ Displacement, data = mydata)
##
## Call:
## lm(formula = Horsepower ~ Displacement, data = mydata)
##
## Coefficients:
## (Intercept) Displacement
## 40.3247 0.3308
# Rows whose Horsepower is missing. The result is kept and reused below
# instead of copying the indices by hand from the printed output.
missing_hp <- which(is.na(mydata$Horsepower))
missing_hp
## [1] 39 134 338 344 362 383
# Estimate Horsepower for those rows from their Displacement, using the
# (rounded) intercept and slope printed by the lm() fit above. One vectorised
# expression replaces the six copies of the same formula.
hp_intercept <- 40.3247
hp_slope <- 0.3308
mydata$Displacement[missing_hp] * hp_slope + hp_intercept
## [1] 72.7431 106.4847 68.4427 86.6367 73.4047 90.2755
# Read the data file once more (no header row), labelling its nine columns.
mydata <- read.table(
  "auto-mpg.data-original",
  header = FALSE,
  col.names = c(
    "MPG", "Cylinders", "Displacement", "Horsepower", "Weight",
    "Acceleration", "Model Year", "Origin", "Car Name"
  )
)