# Load the Auto data set; the strings "?" and "NA" are read as missing values.
Auto <- read.table(
  "http://faculty.marshall.usc.edu/gareth-james/ISL/Auto.data",
  header = TRUE,
  na.strings = c("?", "NA")
)

# Keep only the numeric columns, then drop origin as well.
filtered_auto <- Filter(is.numeric, Auto)
filtered_auto$origin <- NULL

# Print the range, mean and standard deviation of every remaining predictor,
# ignoring missing values. Values are read from filtered_auto, the same data
# frame whose column names drive the loop.
for (predictor in names(filtered_auto)) {
  values <- filtered_auto[[predictor]]
  cat(
    "\nPREDICTOR NAME IS ", predictor,
    " RANGE IS ", range(values, na.rm = TRUE),
    " MEAN IS ", mean(values, na.rm = TRUE),
    " SDEV IS ", sd(values, na.rm = TRUE), "\n"
  )
}
##
## PREDICTOR NAME IS mpg RANGE IS 9 46.6 MEAN IS 23.51587 SDEV IS 7.825804
##
## PREDICTOR NAME IS cylinders RANGE IS 3 8 MEAN IS 5.458438 SDEV IS 1.701577
##
## PREDICTOR NAME IS displacement RANGE IS 68 455 MEAN IS 193.5327 SDEV IS 104.3796
##
## PREDICTOR NAME IS horsepower RANGE IS 46 230 MEAN IS 104.4694 SDEV IS 38.49116
##
## PREDICTOR NAME IS weight RANGE IS 1613 5140 MEAN IS 2970.262 SDEV IS 847.9041
##
## PREDICTOR NAME IS acceleration RANGE IS 8 24.8 MEAN IS 15.55567 SDEV IS 2.749995
##
## PREDICTOR NAME IS year RANGE IS 70 82 MEAN IS 75.99496 SDEV IS 3.690005
The 10th through 85th entries were then removed, and the range, mean, and standard deviation of each predictor were displayed in the same way:
# Drop the 10th through 85th observations. Negative indexing removes exactly
# those rows and does not rely on the row count of a different data frame.
filtered_auto_some_entries <- filtered_auto[-(10:85), ]

# Print the range, mean and standard deviation of every predictor in the
# reduced data, ignoring missing values.
for (predictor in names(filtered_auto_some_entries)) {
  values <- filtered_auto_some_entries[[predictor]]
  cat(
    "\nPREDICTOR NAME IS ", predictor,
    " RANGE IS ", range(values, na.rm = TRUE),
    " MEAN IS ", mean(values, na.rm = TRUE),
    " SDEV IS ", sd(values, na.rm = TRUE), "\n"
  )
}
##
## PREDICTOR NAME IS mpg RANGE IS 11 46.6 MEAN IS 24.43863 SDEV IS 7.908184
##
## PREDICTOR NAME IS cylinders RANGE IS 3 8 MEAN IS 5.370717 SDEV IS 1.653486
##
## PREDICTOR NAME IS displacement RANGE IS 68 455 MEAN IS 187.0498 SDEV IS 99.63539
##
## PREDICTOR NAME IS horsepower RANGE IS 46 230 MEAN IS 100.9558 SDEV IS 35.89557
##
## PREDICTOR NAME IS weight RANGE IS 1649 4997 MEAN IS 2933.963 SDEV IS 810.6429
##
## PREDICTOR NAME IS acceleration RANGE IS 8.5 24.8 MEAN IS 15.72305 SDEV IS 2.680514
##
## PREDICTOR NAME IS year RANGE IS 70 82 MEAN IS 77.15265 SDEV IS 3.11123
Next, I created various plots for the predictors.
# Scatterplot matrix of the numeric columns only. pairs() stops with a
# "non-numeric argument" error when given a character column, and from
# R 4.0.0 on read.table() leaves text columns as character by default.
pairs(Filter(is.numeric, Auto))

# Individual scatterplots for three pairs of variables.
plot(Auto$mpg, Auto$horsepower, xlab = "mpg", ylab = "horsepower")
plot(Auto$weight, Auto$mpg, xlab = "weight", ylab = "mpg")
plot(
  Auto$acceleration, Auto$horsepower,
  xlab = "acceleration", ylab = "horsepower"
)
I found that many of the variables appear to be correlated. Three examples: mpg and horsepower, mpg and weight, and horsepower and acceleration each show a clear relationship in the scatterplots above.
My plots suggest that the most valuable variables for predicting mpg are displacement, horsepower, weight, and year, since their scatterplots against mpg show fairly clear linear or curved (roughly quadratic) trends.
# Revenue figures for each film, in the order given by `region`.
region <- c("US", "non-US")
new_hope <- c(460.998, 314.4)
empire_strikes <- c(290.475, 247.900)
return_jedi <- c(309.306, 165.8)
titles <- c("A New Hope", "The Empire Strikes Back", "Return of the Jedi")

# Stack the films into a matrix with one row per film and one column per
# region, then label both dimensions in a single step.
starWars <- rbind(new_hope, empire_strikes, return_jedi)
dimnames(starWars) <- list(titles, region)
starWars
## US non-US
## A New Hope 460.998 314.4
## The Empire Strikes Back 290.475 247.9
## Return of the Jedi 309.306 165.8
# Total revenue per film: sum across the region columns of each row.
star_wars_row_sums <- rowSums(starWars, na.rm = FALSE, dims = 1)
star_wars_row_sums
## A New Hope The Empire Strikes Back Return of the Jedi
## 775.398 538.375 475.106
The code to create allStarWars and print the total non-US revenue is as follows:
# Revenue figures for the next three films, in the order given by `region`.
phantom_menace <- c(474.5, 552.5)
attack_clones <- c(310.7, 338.7)
revenge_sith <- c(380.3, 468.5)
titles2 <- c(
  "The Phantom Menace",
  "Attack of the Clones",
  "Revenge of the Sith"
)

# Build the second matrix and label rows and columns in a single step.
starWars2 <- rbind(phantom_menace, attack_clones, revenge_sith)
dimnames(starWars2) <- list(titles2, region)

# Append the new films below the existing ones and total each region.
allStarWars <- rbind(starWars, starWars2)
colSums(allStarWars)
## US non-US
## 2226.279 2087.800
Therefore the total non-US revenue is 2087.800 million.
# Read the College data. stringsAsFactors = TRUE makes text columns such as
# Private factors regardless of the running R version's default, so that
# summary() reports level counts and plot() can group by them.
college <- read.csv(
  "http://faculty.marshall.usc.edu/gareth-james/ISL/College.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)

# Use the first column as row names, then drop it so it is not treated as
# data. View() opens a data viewer, so it is only called in interactive
# sessions.
rownames(college) <- college[, 1]
if (interactive()) {
  View(college)
}
college <- college[, -1]
if (interactive()) {
  View(college)
}
summary(college)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
# Private must be a factor for both plots below: pairs() rejects character
# columns, and plot() only draws box plots by group when x is a factor.
# as.factor() leaves the column unchanged if it already is one.
college$Private <- as.factor(college$Private)

# Scatterplot matrix of the first ten columns.
pairs(college[, 1:10])

# Side-by-side box plots of Outstate for each level of Private.
plot(college$Private, college$Outstate, xlab = "Private", ylab = "Outstate")
I then created the variable Elite, and added it to the college dataframe. Then I created a side-by-side boxplot of Outstate vs Elite.
# Flag colleges whose Top10perc value exceeds 50 as "Yes", all others as "No".
Elite <- rep("No", nrow(college))
Elite[college$Top10perc > 50] <- "Yes"

# Store the flag as a factor explicitly: from R 4.0.0 on data.frame() keeps
# character vectors as character, and plot() needs a factor on the x axis to
# draw side-by-side box plots.
Elite <- as.factor(Elite)
college <- data.frame(college, Elite)
# summary(Elite)
plot(
  college$Elite, college$Outstate,
  xlab = "Is college elite", ylab = "Outstate"
)