Problem 1. (20%) Please use ggpairs in package GGally to explore the Titanic data set. If our goal is to predict the survival status of passengers,could you find some potential predictors based on the result of ggpairs?
dta <- read.csv(file="http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv",h=T,sep=",")
dta <- dta[,c(2,1,4,5,6,7,9,11)]
dta$survived <- as.factor(dta$survived)
dta$pclass<- as.factor(dta$pclass)
dta <- subset(dta,embarked!="")
dta$embarked <- factor(dta$embarked)
library(GGally);library(ggplot2)
ggplot <- function(...) ggplot2::ggplot(...) + scale_color_brewer(palette="Set1")+ scale_fill_brewer(palette="Set1")
unlockBinding("ggplot",parent.env(asNamespace("GGally")))
assign("ggplot",ggplot,parent.env(asNamespace("GGally")))
ggpairs(dta,column=1:8,mapping =aes(col=survived),
upper = list(continuous = "cor", combo = "box",discrete = "ratio"),
lower = list(continuous = "smooth", combo = "facethist",discrete = "facetbar"))
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
## Warning: Removed 263 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 263 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 263 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 263 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 263 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 263 rows containing non-finite values (stat_bin).
## Warning: Removed 263 rows containing non-finite values (stat_density).
## Warning in fn(x$data, plotObj$mapping): Removed 263 rows containing missing
## values
## Warning in fn(x$data, plotObj$mapping): Removed 263 rows containing missing
## values
## Warning in fn(x$data, plotObj$mapping): Removed 264 rows containing missing
## values
## Warning: Removed 263 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 263 rows containing non-finite values (stat_smooth).
## Warning: Removed 263 rows containing missing values (geom_point).
## Warning in fn(x$data, plotObj$mapping): Removing 1 row that contained a
## missing value
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 263 rows containing non-finite values (stat_smooth).
## Warning: Removed 263 rows containing missing values (geom_point).
## Warning in fn(x$data, plotObj$mapping): Removing 1 row that contained a
## missing value
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 264 rows containing non-finite values (stat_smooth).
## Warning: Removed 264 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 263 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
problem4
dta <- read.csv(file="http://www.amstat.org/publications/jse/datasets/baseball.dat.txt",h=F,na.strings = NA,sep="")
df <- dta[, c(1, 2, 3, 5, 8, 12, 13, 14)]
rownames(df) <- dta[,18]
colnames(df) <- c("salary", "BA", "OBP", "H", "HR", "SB", "E", "FAE")
fit1 <- lm(salary~H,data=df)
e_salary <- residuals(fit1)
head(round(e_salary,2))
## Andre Dawson Steve Buchele Kal Daniels Shawon Dunston
## 1158.37 1081.81 922.44 704.47
## Mark Grace Ryne Sandberg
## -66.13 -218.98
fit2 <- lm(HR~H,data=df)
e_HR <- residuals(fit2)
head(round(e_HR,2))
## Andre Dawson Steve Buchele Kal Daniels Shawon Dunston
## 15.29 6.91 5.47 -0.96
## Mark Grace Ryne Sandberg
## -9.47 8.42
You can also embed plots, for example:
fit3 <- lm(e_salary~e_HR)
summary(fit3)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.333209e-15 49.892412 8.685106e-17 1.000000e+00
## e_HR 4.478106e+01 6.814331 6.571600e+00 1.906118e-10
fit4 <- lm(salary~HR+H,data=df)
summary(fit4)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -79.97622 102.794373 -0.7780214 4.371072e-01
## HR 44.78106 6.824524 6.5617848 2.028157e-10
## H 9.92192 1.221655 8.1217059 8.980489e-15
head(dta) ; rownames(dta) <- dta[,18];dta <- dta[,-18]
colnames(dta) <- c("salary", "BA", "OBP","R" ,"H","2B","3B", "HR","RBI","BB","SO","SB", "E", "FAE","FA91","AE","A91")
min_model = lm(salary ~ 1, data=dta)
max_model <- formula(lm(salary~.,data=dta))
step_model = step(min_model, direction='both', scope=max_model)
max_model_2 <- formula(lm(salary~RBI*FAE*AE*SO*SB*HR*FA91*R*E,data=dta))
step_model = step(min_model, direction='both', scope=max_model_2)
summary((max_model <- lm(salary~.,data=dta)))$r.squared
## [1] 0.7014386
max_5_model <- lm(salary ~ RBI + FAE + AE + RBI:FAE + RBI:AE,data=dta)
summary(max_5_model)$r.squared
## [1] 0.7191078
max_10_model <- (lm(salary ~ RBI + FAE + AE + SB + SO + FA91 + RBI:FAE + RBI:AE +
SB:SO + RBI:FA91 ,data=dta))
summary(max_10_model)$r.squared
## [1] 0.7557875
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.