ML_HW1_Part

Problem 1. (20%) Please use ggpairs in package GGally to explore the Titanic data set. If our goal is to predict the survival status of passengers,could you find some potential predictors based on the result of ggpairs?

dta <- read.csv(file="http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv",h=T,sep=",")
dta <- dta[,c(2,1,4,5,6,7,9,11)]
dta$survived <- as.factor(dta$survived)
dta$pclass<- as.factor(dta$pclass)
dta <- subset(dta,embarked!="")
dta$embarked <- factor(dta$embarked)
library(GGally);library(ggplot2)
ggplot <- function(...) ggplot2::ggplot(...) + scale_color_brewer(palette="Set1")+ scale_fill_brewer(palette="Set1")
unlockBinding("ggplot",parent.env(asNamespace("GGally")))
assign("ggplot",ggplot,parent.env(asNamespace("GGally")))
ggpairs(dta,column=1:8,mapping =aes(col=survived),
        upper = list(continuous = "cor", combo = "box",discrete = "ratio"),
        lower = list(continuous = "smooth", combo = "facethist",discrete = "facetbar"))

## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.

## Warning: Removed 263 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

## Warning: Removed 263 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

## Warning: Removed 263 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 263 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 263 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 263 rows containing non-finite values (stat_bin).

## Warning: Removed 263 rows containing non-finite values (stat_density).

## Warning in fn(x$data, plotObj$mapping): Removed 263 rows containing missing
## values

## Warning in fn(x$data, plotObj$mapping): Removed 263 rows containing missing
## values

## Warning in fn(x$data, plotObj$mapping): Removed 264 rows containing missing
## values

## Warning: Removed 263 rows containing non-finite values (stat_boxplot).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 263 rows containing non-finite values (stat_smooth).

## Warning: Removed 263 rows containing missing values (geom_point).

## Warning in fn(x$data, plotObj$mapping): Removing 1 row that contained a
## missing value

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 263 rows containing non-finite values (stat_smooth).

## Warning: Removed 263 rows containing missing values (geom_point).

## Warning in fn(x$data, plotObj$mapping): Removing 1 row that contained a
## missing value

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1 rows containing non-finite values (stat_bin).

## Warning: Removed 264 rows containing non-finite values (stat_smooth).

## Warning: Removed 264 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_density).

## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 263 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1 rows containing non-finite values (stat_bin).

problem4

dta <- read.csv(file="http://www.amstat.org/publications/jse/datasets/baseball.dat.txt",h=F,na.strings = NA,sep="")
df <- dta[, c(1, 2, 3, 5, 8, 12, 13, 14)]
rownames(df) <- dta[,18]
colnames(df) <- c("salary", "BA", "OBP", "H", "HR", "SB", "E", "FAE")

fit1 <- lm(salary~H,data=df)
e_salary <- residuals(fit1)
head(round(e_salary,2))

## Andre Dawson      Steve Buchele     Kal Daniels       Shawon Dunston    
##           1158.37           1081.81            922.44            704.47 
## Mark Grace        Ryne Sandberg     
##            -66.13           -218.98

fit2 <- lm(HR~H,data=df)
e_HR <- residuals(fit2)
head(round(e_HR,2))

## Andre Dawson      Steve Buchele     Kal Daniels       Shawon Dunston    
##             15.29              6.91              5.47             -0.96 
## Mark Grace        Ryne Sandberg     
##             -9.47              8.42

You can also embed plots, for example:

fit3 <- lm(e_salary~e_HR)
summary(fit3)$coefficients

##                 Estimate Std. Error      t value     Pr(>|t|)
## (Intercept) 4.333209e-15  49.892412 8.685106e-17 1.000000e+00
## e_HR        4.478106e+01   6.814331 6.571600e+00 1.906118e-10

fit4 <-  lm(salary~HR+H,data=df)
summary(fit4)$coefficients

##              Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) -79.97622 102.794373 -0.7780214 4.371072e-01
## HR           44.78106   6.824524  6.5617848 2.028157e-10
## H             9.92192   1.221655  8.1217059 8.980489e-15

head(dta) ; rownames(dta) <- dta[,18];dta <- dta[,-18]
colnames(dta) <- c("salary", "BA", "OBP","R" ,"H","2B","3B", "HR","RBI","BB","SO","SB", "E", "FAE","FA91","AE","A91")
min_model = lm(salary ~ 1, data=dta)
max_model <- formula(lm(salary~.,data=dta))
step_model = step(min_model, direction='both', scope=max_model)
max_model_2 <- formula(lm(salary~RBI*FAE*AE*SO*SB*HR*FA91*R*E,data=dta))
step_model = step(min_model, direction='both', scope=max_model_2)

summary((max_model <- lm(salary~.,data=dta)))$r.squared

## [1] 0.7014386

max_5_model <- lm(salary ~ RBI + FAE + AE + RBI:FAE + RBI:AE,data=dta)
summary(max_5_model)$r.squared

## [1] 0.7191078

max_10_model <- (lm(salary ~ RBI + FAE + AE + SB + SO + FA91 + RBI:FAE + RBI:AE + 
    SB:SO + RBI:FA91 ,data=dta))
summary(max_10_model)$r.squared

## [1] 0.7557875

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

ML_HW1_Part

Chi-Lin Yu

March 13, 2016