library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.5
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
m1<-lm(hp~mpg,data=mtcars)

summary(m1)
## 
## Call:
## lm(formula = hp ~ mpg, data = mtcars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -59.26 -28.93 -13.45  25.65 143.36 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   324.08      27.43  11.813 8.25e-13 ***
## mpg            -8.83       1.31  -6.742 1.79e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43.95 on 30 degrees of freedom
## Multiple R-squared:  0.6024, Adjusted R-squared:  0.5892 
## F-statistic: 45.46 on 1 and 30 DF,  p-value: 1.788e-07
ggplot(data=mtcars,aes(x=mpg,y=hp)) + geom_point() + geom_abline(intercept = 325,slope = -9,col=4) + geom_abline(intercept = 326,slope = -7,col=2) + geom_abline(intercept = 322,slope = -8,col=3) + scale_x_continuous("MPG") + scale_y_continuous("HP")

ggplot(data=mtcars,aes(x=mpg,y=hp)) + geom_point(col=3) +  scale_x_continuous("MPG") + scale_y_continuous("HP") + geom_smooth(method="lm",se=F) +theme_bw()

Simple linear regression equation

\[ \ y_{i} = \ b_{0} + \ b_{1}\times {x} \]

\[ \ b_{1} = r_{(x,y)}\times \frac{S_{y}}{S_{x}} \]

\[ \ b_{0}= \ mean_{y} - \ b_{1}\times {mean_{x}} \] * How to calculate slope and intercept by hand

library(dplyr)
my_cal<- mtcars %>% dplyr::select(hp,mpg) %>% dplyr::summarise(slope=cor(hp,mpg)*sd(hp,na.rm = T)/sd(mpg,na.rm = T), intercept=mean(hp,na.rm = T)-slope*mean(mpg,na.rm = T))

my_cal
##       slope intercept
## 1 -8.829731  324.0823
library(broom)
## Warning: package 'broom' was built under R version 3.2.5
my_data<-augment(m1)

mean(my_data$.se.fit)
## [1] 10.52047
head(my_data)
##           .rownames  hp  mpg  .fitted  .se.fit    .resid       .hat
## 1         Mazda RX4 110 21.0 138.6580 7.859249 -28.65796 0.03198439
## 2     Mazda RX4 Wag 110 21.0 138.6580 7.859249 -28.65796 0.03198439
## 3        Datsun 710  93 22.8 122.7644 8.540431 -29.76445 0.03776901
## 4    Hornet 4 Drive 110 21.4 135.1261 7.955493 -25.12607 0.03277255
## 5 Hornet Sportabout 175 18.7 158.9663 7.979104  16.03366 0.03296737
## 6           Valiant 105 18.1 164.2642 8.194232 -59.26418 0.03476902
##     .sigma     .cooksd .std.resid
## 1 44.36803 0.007257885 -0.6628147
## 2 44.36803 0.007257885 -0.6628147
## 3 44.33994 0.009356611 -0.6904721
## 4 44.44402 0.005725958 -0.5813642
## 5 44.59385 0.002346460  0.3710223
## 6 43.27012 0.033935999 -1.3726654
new_value<-data.frame(mpg=c(25.6))

new_pred<-augment(m1,newdata=new_value)
# present predicted response value on the plot

ggplot(data=mtcars,aes(x=mpg,y=hp)) + geom_point() + geom_smooth(method="lm",se=F) + geom_point(data=new_pred,aes(y=.fitted),col=2)

my_R_squared<-my_data %>% dplyr::summarise(var_y=var(hp),var_e=var(.resid)) %>% dplyr::mutate(R_squared=1-var_e/var_y)

head(my_R_squared)
##      var_y    var_e R_squared
## 1 4700.867 1868.889 0.6024373
SSE_full<-var(my_data$.resid)

SSE_full
## [1] 1868.889
# null model

null_model<-lm(hp~1,data = mtcars)

null_data<-augment(null_model)

var(null_data$.resid)
## [1] 4700.867

CPS85 dataset is from mosaicdata package

library(mosaic)
## Warning: package 'mosaic' was built under R version 3.2.5
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.2.5
## Loading required package: mosaicData
## Warning: package 'mosaicData' was built under R version 3.2.5
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The 'mosaic' package masks several functions from core packages in order to add additional features.  
## The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cov, D, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
head(CPS85)
##   wage educ race sex hispanic south married exper union age   sector
## 1  9.0   10    W   M       NH    NS Married    27   Not  43    const
## 2  5.5   12    W   M       NH    NS Married    20   Not  38    sales
## 3  3.8   12    W   F       NH    NS  Single     4   Not  22    sales
## 4 10.5   12    W   F       NH    NS Married    29   Not  47 clerical
## 5 15.0   12    W   M       NH    NS Married    40 Union  58    const
## 6  9.0   16    W   F       NH    NS Married    27   Not  49 clerical
mean(age~sector,data=CPS85)
## clerical    const    manag    manuf    other     prof    sales  service 
## 36.51546 38.70000 38.65455 36.97059 33.14706 37.05714 37.65789 37.79518
# predict a new value using evaluate_model() fuction
library(statisticalModeling)
## Warning: package 'statisticalModeling' was built under R version 3.2.5
predict(m1,newdata = new_value)
##       1 
## 98.0412
# alternative
evaluate_model(m1,data=new_value)
##    mpg model_output
## 1 25.6      98.0412
library(dplyr)

library(NHANES)
## Warning: package 'NHANES' was built under R version 3.2.5
# building a random tree model

head(NHANES)
## # A tibble: 6 × 76
##      ID SurveyYr Gender   Age AgeDecade AgeMonths  Race1  Race3
##   <int>   <fctr> <fctr> <int>    <fctr>     <int> <fctr> <fctr>
## 1 51624  2009_10   male    34     30-39       409  White     NA
## 2 51624  2009_10   male    34     30-39       409  White     NA
## 3 51624  2009_10   male    34     30-39       409  White     NA
## 4 51625  2009_10   male     4       0-9        49  Other     NA
## 5 51630  2009_10 female    49     40-49       596  White     NA
## 6 51638  2009_10   male     9       0-9       115  White     NA
## # ... with 68 more variables: Education <fctr>, MaritalStatus <fctr>,
## #   HHIncome <fctr>, HHIncomeMid <int>, Poverty <dbl>, HomeRooms <int>,
## #   HomeOwn <fctr>, Work <fctr>, Weight <dbl>, Length <dbl>,
## #   HeadCirc <dbl>, Height <dbl>, BMI <dbl>, BMICatUnder20yrs <fctr>,
## #   BMI_WHO <fctr>, Pulse <int>, BPSysAve <int>, BPDiaAve <int>,
## #   BPSys1 <int>, BPDia1 <int>, BPSys2 <int>, BPDia2 <int>, BPSys3 <int>,
## #   BPDia3 <int>, Testosterone <dbl>, DirectChol <dbl>, TotChol <dbl>,
## #   UrineVol1 <int>, UrineFlow1 <dbl>, UrineVol2 <int>, UrineFlow2 <dbl>,
## #   Diabetes <fctr>, DiabetesAge <int>, HealthGen <fctr>,
## #   DaysPhysHlthBad <int>, DaysMentHlthBad <int>, LittleInterest <fctr>,
## #   Depressed <fctr>, nPregnancies <int>, nBabies <int>, Age1stBaby <int>,
## #   SleepHrsNight <int>, SleepTrouble <fctr>, PhysActive <fctr>,
## #   PhysActiveDays <int>, TVHrsDay <fctr>, CompHrsDay <fctr>,
## #   TVHrsDayChild <int>, CompHrsDayChild <int>, Alcohol12PlusYr <fctr>,
## #   AlcoholDay <int>, AlcoholYear <int>, SmokeNow <fctr>, Smoke100 <fctr>,
## #   Smoke100n <fctr>, SmokeAge <int>, Marijuana <fctr>,
## #   AgeFirstMarij <int>, RegularMarij <fctr>, AgeRegMarij <int>,
## #   HardDrugs <fctr>, SexEver <fctr>, SexAge <int>, SexNumPartnLife <int>,
## #   SexNumPartYear <int>, SameSex <fctr>, SexOrientation <fctr>,
## #   PregnantNow <fctr>
library(rpart)
## Warning: package 'rpart' was built under R version 3.2.5
ml_r<-rpart(SmokeNow~Marijuana+Poverty+Weight+TotChol,data=NHANES,cp=0.002)

library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.2.5
prp(ml_r,type=4,extra=100,varlen=0)

head(SAT)
##        state expend ratio salary frac verbal math  sat
## 1    Alabama  4.405  17.2 31.144    8    491  538 1029
## 2     Alaska  8.963  17.6 47.951   47    445  489  934
## 3    Arizona  4.778  19.3 32.175   27    448  496  944
## 4   Arkansas  4.459  17.1 28.934    6    482  523 1005
## 5 California  4.992  24.0 41.078   45    417  485  902
## 6   Colorado  5.443  18.4 34.571   29    462  518  980