library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.5
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
m1<-lm(hp~mpg,data=mtcars)
summary(m1)
##
## Call:
## lm(formula = hp ~ mpg, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.26 -28.93 -13.45 25.65 143.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 324.08 27.43 11.813 8.25e-13 ***
## mpg -8.83 1.31 -6.742 1.79e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43.95 on 30 degrees of freedom
## Multiple R-squared: 0.6024, Adjusted R-squared: 0.5892
## F-statistic: 45.46 on 1 and 30 DF, p-value: 1.788e-07
ggplot(data=mtcars,aes(x=mpg,y=hp)) + geom_point() + geom_abline(intercept = 325,slope = -9,col=4) + geom_abline(intercept = 326,slope = -7,col=2) + geom_abline(intercept = 322,slope = -8,col=3) + scale_x_continuous("MPG") + scale_y_continuous("HP")
ggplot(data=mtcars,aes(x=mpg,y=hp)) + geom_point(col=3) + scale_x_continuous("MPG") + scale_y_continuous("HP") + geom_smooth(method="lm",se=F) +theme_bw()
Simple linear regression equation
\[ \ y_{i} = \ b_{0} + \ b_{1}\times {x} \]
\[ \ b_{1} = r_{(x,y)}\times \frac{S_{y}}{S_{x}} \]
\[ \ b_{0}= \ mean_{y} - \ b_{1}\times {mean_{x}} \] * How to calculate slope and intercept by hand
library(dplyr)
my_cal<- mtcars %>% dplyr::select(hp,mpg) %>% dplyr::summarise(slope=cor(hp,mpg)*sd(hp,na.rm = T)/sd(mpg,na.rm = T), intercept=mean(hp,na.rm = T)-slope*mean(mpg,na.rm = T))
my_cal
## slope intercept
## 1 -8.829731 324.0823
library(broom)
## Warning: package 'broom' was built under R version 3.2.5
my_data<-augment(m1)
mean(my_data$.se.fit)
## [1] 10.52047
head(my_data)
## .rownames hp mpg .fitted .se.fit .resid .hat
## 1 Mazda RX4 110 21.0 138.6580 7.859249 -28.65796 0.03198439
## 2 Mazda RX4 Wag 110 21.0 138.6580 7.859249 -28.65796 0.03198439
## 3 Datsun 710 93 22.8 122.7644 8.540431 -29.76445 0.03776901
## 4 Hornet 4 Drive 110 21.4 135.1261 7.955493 -25.12607 0.03277255
## 5 Hornet Sportabout 175 18.7 158.9663 7.979104 16.03366 0.03296737
## 6 Valiant 105 18.1 164.2642 8.194232 -59.26418 0.03476902
## .sigma .cooksd .std.resid
## 1 44.36803 0.007257885 -0.6628147
## 2 44.36803 0.007257885 -0.6628147
## 3 44.33994 0.009356611 -0.6904721
## 4 44.44402 0.005725958 -0.5813642
## 5 44.59385 0.002346460 0.3710223
## 6 43.27012 0.033935999 -1.3726654
mpgnew_value<-data.frame(mpg=c(25.6))
new_pred<-augment(m1,newdata=new_value)
# present predicted response value on the plot
ggplot(data=mtcars,aes(x=mpg,y=hp)) + geom_point() + geom_smooth(method="lm",se=F) + geom_point(data=new_pred,aes(y=.fitted),col=2)
my_R_squared<-my_data %>% dplyr::summarise(var_y=var(hp),var_e=var(.resid)) %>% dplyr::mutate(R_squared=1-var_e/var_y)
head(my_R_squared)
## var_y var_e R_squared
## 1 4700.867 1868.889 0.6024373
SSE_full<-var(my_data$.resid)
SSE_full
## [1] 1868.889
# null model
null_model<-lm(hp~1,data = mtcars)
null_data<-augment(null_model)
var(null_data$.resid)
## [1] 4700.867
CPS85 dataset is from mosaicdata package
library(mosaic)
## Warning: package 'mosaic' was built under R version 3.2.5
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.2.5
## Loading required package: mosaicData
## Warning: package 'mosaicData' was built under R version 3.2.5
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
##
## The 'mosaic' package masks several functions from core packages in order to add additional features.
## The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cov, D, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
head(CPS85)
## wage educ race sex hispanic south married exper union age sector
## 1 9.0 10 W M NH NS Married 27 Not 43 const
## 2 5.5 12 W M NH NS Married 20 Not 38 sales
## 3 3.8 12 W F NH NS Single 4 Not 22 sales
## 4 10.5 12 W F NH NS Married 29 Not 47 clerical
## 5 15.0 12 W M NH NS Married 40 Union 58 const
## 6 9.0 16 W F NH NS Married 27 Not 49 clerical
mean(age~sector,data=CPS85)
## clerical const manag manuf other prof sales service
## 36.51546 38.70000 38.65455 36.97059 33.14706 37.05714 37.65789 37.79518
# predict a new value using evaluate_model() fuction
library(statisticalModeling)
## Warning: package 'statisticalModeling' was built under R version 3.2.5
predict(m1,newdata = new_value)
## 1
## 98.0412
# alternative
evaluate_model(m1,data=new_value)
## mpg model_output
## 1 25.6 98.0412
NHANES from dplyr packagelibrary(dplyr)
library(NHANES)
## Warning: package 'NHANES' was built under R version 3.2.5
# building a random tree model
head(NHANES)
## # A tibble: 6 × 76
## ID SurveyYr Gender Age AgeDecade AgeMonths Race1 Race3
## <int> <fctr> <fctr> <int> <fctr> <int> <fctr> <fctr>
## 1 51624 2009_10 male 34 30-39 409 White NA
## 2 51624 2009_10 male 34 30-39 409 White NA
## 3 51624 2009_10 male 34 30-39 409 White NA
## 4 51625 2009_10 male 4 0-9 49 Other NA
## 5 51630 2009_10 female 49 40-49 596 White NA
## 6 51638 2009_10 male 9 0-9 115 White NA
## # ... with 68 more variables: Education <fctr>, MaritalStatus <fctr>,
## # HHIncome <fctr>, HHIncomeMid <int>, Poverty <dbl>, HomeRooms <int>,
## # HomeOwn <fctr>, Work <fctr>, Weight <dbl>, Length <dbl>,
## # HeadCirc <dbl>, Height <dbl>, BMI <dbl>, BMICatUnder20yrs <fctr>,
## # BMI_WHO <fctr>, Pulse <int>, BPSysAve <int>, BPDiaAve <int>,
## # BPSys1 <int>, BPDia1 <int>, BPSys2 <int>, BPDia2 <int>, BPSys3 <int>,
## # BPDia3 <int>, Testosterone <dbl>, DirectChol <dbl>, TotChol <dbl>,
## # UrineVol1 <int>, UrineFlow1 <dbl>, UrineVol2 <int>, UrineFlow2 <dbl>,
## # Diabetes <fctr>, DiabetesAge <int>, HealthGen <fctr>,
## # DaysPhysHlthBad <int>, DaysMentHlthBad <int>, LittleInterest <fctr>,
## # Depressed <fctr>, nPregnancies <int>, nBabies <int>, Age1stBaby <int>,
## # SleepHrsNight <int>, SleepTrouble <fctr>, PhysActive <fctr>,
## # PhysActiveDays <int>, TVHrsDay <fctr>, CompHrsDay <fctr>,
## # TVHrsDayChild <int>, CompHrsDayChild <int>, Alcohol12PlusYr <fctr>,
## # AlcoholDay <int>, AlcoholYear <int>, SmokeNow <fctr>, Smoke100 <fctr>,
## # Smoke100n <fctr>, SmokeAge <int>, Marijuana <fctr>,
## # AgeFirstMarij <int>, RegularMarij <fctr>, AgeRegMarij <int>,
## # HardDrugs <fctr>, SexEver <fctr>, SexAge <int>, SexNumPartnLife <int>,
## # SexNumPartYear <int>, SameSex <fctr>, SexOrientation <fctr>,
## # PregnantNow <fctr>
library(rpart)
## Warning: package 'rpart' was built under R version 3.2.5
ml_r<-rpart(SmokeNow~Marijuana+Poverty+Weight+TotChol,data=NHANES,cp=0.002)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.2.5
prp(ml_r,type=4,extra=100,varlen=0)
SAT from mosaicData packagehead(SAT)
## state expend ratio salary frac verbal math sat
## 1 Alabama 4.405 17.2 31.144 8 491 538 1029
## 2 Alaska 8.963 17.6 47.951 47 445 489 934
## 3 Arizona 4.778 19.3 32.175 27 448 496 944
## 4 Arkansas 4.459 17.1 28.934 6 482 523 1005
## 5 California 4.992 24.0 41.078 45 417 485 902
## 6 Colorado 5.443 18.4 34.571 29 462 518 980