Data #1

Relationship between Price and other variable

plot(Price~SqFt,data=Houseprice)

plot(Price~Bedrooms,data=Houseprice)

plot(Price~Bathrooms,data=Houseprice)

plot(Price~Offers,data=Houseprice)

plot(Price~Brick,data=Houseprice)

plot(Price~Neighborhood,data=Houseprice)

For Home ID, Offers, Brick, and Neighborhood, their variables are all nominal or categorical.

So it is nonsense to find numerical measure from those variable.

##Change non numeric value into numeric value
a1=rep(1,length(Houseprice$Brick))
a2=rep(0,length(Houseprice$Brick))
Houseprice$BrickYes=ifelse(Houseprice$Brick=="Yes",a1,a2)

b1=rep(1,length(Houseprice$Neighborhood))
b2=rep(0,length(Houseprice$Neighborhood))
Houseprice$NeighborhoodEast=ifelse(Houseprice$Neighborhood=="East",b1,b2)
Houseprice$NeighborhoodWest=ifelse(Houseprice$Neighborhood=="West",b1,b2)
HP=Houseprice[-7:-8]

m1 = lm(Price~., data = HP)
summary(m1)

## 
## Call:
## lm(formula = Price ~ ., data = HP)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27897.8  -6074.8    -48.7   5551.8  27536.4 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2037.726   8911.501   0.229 0.819524    
## HomeID             -11.456     25.387  -0.451 0.652616    
## SqFt                53.634      5.926   9.051 3.30e-15 ***
## Bedrooms          4136.461   1621.775   2.551 0.012023 *  
## Bathrooms         7975.157   2133.831   3.737 0.000287 ***
## Offers           -8350.128   1103.693  -7.566 8.96e-12 ***
## BrickYes         17313.540   1988.548   8.707 2.12e-14 ***
## NeighborhoodEast -1729.613   2433.756  -0.711 0.478675    
## NeighborhoodWest 20534.706   3176.051   6.465 2.33e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10050 on 119 degrees of freedom
## Multiple R-squared:  0.8688, Adjusted R-squared:   0.86 
## F-statistic: 98.54 on 8 and 119 DF,  p-value: < 2.2e-16

lm.beta(m1)

## 
## Call:
## lm(formula = Price ~ ., data = HP)
## 
## Standardized Coefficients::
##      (Intercept)           HomeID             SqFt         Bedrooms 
##        0.0000000       -0.0158165        0.4223306        0.1117606 
##        Bathrooms           Offers         BrickYes NeighborhoodEast 
##        0.1527110       -0.3323187        0.3037422       -0.0308560 
## NeighborhoodWest 
##        0.3531516

cor(HP)

##                       HomeID      Price       SqFt    Bedrooms
## HomeID            1.00000000  0.1081900 0.16855330 -0.06856814
## Price             0.10818998  1.0000000 0.55298224  0.52592606
## SqFt              0.16855330  0.5529822 1.00000000  0.48380711
## Bedrooms         -0.06856814  0.5259261 0.48380711  1.00000000
## Bathrooms         0.12769353  0.5232578 0.52274530  0.41455596
## Offers           -0.05359711 -0.3136359 0.33692335  0.11427061
## BrickYes          0.03737646  0.4528168 0.07979216  0.04638008
## NeighborhoodEast -0.07550453 -0.1429589 0.04563915 -0.09175034
## NeighborhoodWest  0.02687339  0.7140066 0.25075921  0.47147686
##                     Bathrooms      Offers    BrickYes NeighborhoodEast
## HomeID            0.127693528 -0.05359711  0.03737646     -0.075504526
## Price             0.523257758 -0.31363588  0.45281679     -0.142958878
## SqFt              0.522745301  0.33692335  0.07979216      0.045639151
## Bedrooms          0.414555956  0.11427061  0.04638008     -0.091750338
## Bathrooms         1.000000000  0.14379340  0.17197691     -0.001247208
## Offers            0.143793404  1.00000000 -0.14498606     -0.015602052
## BrickYes          0.171976913 -0.14498606  1.00000000      0.147563896
## NeighborhoodEast -0.001247208 -0.01560205  0.14756390      1.000000000
## NeighborhoodWest  0.285923136 -0.32742521  0.11579310     -0.487421308
##                  NeighborhoodWest
## HomeID                 0.02687339
## Price                  0.71400660
## SqFt                   0.25075921
## Bedrooms               0.47147686
## Bathrooms              0.28592314
## Offers                -0.32742521
## BrickYes               0.11579310
## NeighborhoodEast      -0.48742131
## NeighborhoodWest       1.00000000

Standardized regression

y = -0.016X1 + 0.422X2 + 0.112X3 + 0.153X4 - 0.332X5 + 0.304X6 - .031X7 + 0.353X8

This is a good mode that represents relationship among price and other factor. Its adjusted R-squared is .86, which is high.

library(leaps)
x = HP[,2:9]
y = HP[,1]
out = summary(regsubsets(x,y,nbest = 2, nvmax = ncol(x)))
tab=cbind(out$which,out$rsq,out$adjr2,out$cp)
tab

##   (Intercept) Price SqFt Bedrooms Bathrooms Offers BrickYes
## 1           1     0    1        0         0      0        0
## 1           1     0    0        0         1      0        0
## 2           1     0    1        1         0      0        0
## 2           1     0    1        0         0      1        0
## 3           1     0    1        1         0      1        0
## 3           1     0    1        1         0      0        0
## 4           1     0    1        1         0      1        0
## 4           1     0    1        1         1      1        0
## 5           1     0    1        1         0      1        0
## 5           1     0    1        1         1      1        0
## 6           1     0    1        1         1      1        0
## 6           1     1    1        1         1      1        0
## 7           1     1    1        1         1      1        0
## 7           1     0    1        1         1      1        1
## 8           1     1    1        1         1      1        1
##   NeighborhoodEast NeighborhoodWest                                
## 1                0                0 0.02841022 0.020699186 5.087346
## 1                0                0 0.01630564 0.008498539 6.695584
## 2                0                0 0.05783147 0.042756774 3.178380
## 2                0                0 0.04215579 0.026830282 5.261082
## 3                0                0 0.07411571 0.051715287 3.014823
## 3                1                0 0.06917879 0.046658925 3.670752
## 4                1                0 0.08662784 0.056924677 3.352437
## 4                0                0 0.08085054 0.050959497 4.120021
## 5                1                1 0.09366017 0.056515094 4.418108
## 5                1                0 0.09331740 0.056158273 4.463649
## 6                1                1 0.10250879 0.058005098 5.242462
## 6                1                0 0.09734382 0.052584014 5.928690
## 7                1                1 0.10290746 0.050577060 7.189494
## 7                1                1 0.10280096 0.050464345 7.203645
## 8                1                1 0.10433371 0.044120849 9.000000

cross-validation for the model on all variables

n=length(HP$Price)
diff=dim(n)
percdiff=dim(n)
for (k in 1:n) {
  train1=c(1:n)
  train=train1[train1!=k]
  ## the R expression "train1[train1!=k]" picks from train1 those 
  ## elements that are different from k and stores those elements in the
  ## object train. 
  ## For k=1, train consists of elements that are different from 1; that 
  ## is 2, 3, ???, n.
  m2=lm(Price~.,data = HP[train1,])
  pred=predict(m2,newdat=HP[train,])
  obs=HP$Price[-train]
  diff[k]=obs-pred
  percdiff[k]=abs(diff[k])/obs
}
me=mean(diff)
rmse=sqrt(mean(diff**2))
mape=100*(mean(percdiff))

##mean error
me

## [1] 26502.74

##root mean square error
rmse

## [1] 37724.98

##mean absolute percent error
mape

## [1] 20.42654

cross-validation for the model on SqFt only

me1  # mean error

## [1] 0.1045893

rmse1 # root mean square error

## [1] 22634.2

mape1 # mean absolute percent error

## [1] 14.3107

The cross validation for the model on SqFt only has low mean error, root mean squre error, and mean absolute percent error when it compered with the cross validation model with all other variables.

Data2

## 
## Call:
## lm(formula = AmountSpent ~ ., data = mkt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1648.11  -286.72   -12.63   218.21  2771.25 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -5.228e+02  9.341e+01  -5.596 3.17e-08 ***
## Salary         1.883e-02  1.245e-03  15.124  < 2e-16 ***
## Children      -2.683e+02  2.502e+01 -10.723  < 2e-16 ***
## Catalogs       4.052e+01  2.868e+00  14.128  < 2e-16 ***
## AgeOld        -4.827e+01  6.189e+01  -0.780    0.436    
## AgeMiddle     -8.965e+01  5.874e+01  -1.526    0.127    
## GenderMale    -5.370e+01  3.802e+01  -1.413    0.158    
## OwnHomeOwn     1.829e+01  4.151e+01   0.441    0.660    
## MarriedSingle  1.950e+01  4.981e+01   0.392    0.696    
## LocationFar    6.090e+02  4.399e+01  13.845  < 2e-16 ***
## HistoryHigh    3.446e+02  5.996e+01   5.746 1.38e-08 ***
## HistoryLow     7.704e+01  5.889e+01   1.308    0.191    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 463.5 on 685 degrees of freedom
##   (303 observations deleted due to missingness)
## Multiple R-squared:  0.7887, Adjusted R-squared:  0.7853 
## F-statistic: 232.5 on 11 and 685 DF,  p-value: < 2.2e-16

This model represents the relashionship among amount spent and other factors with high adjusted R square, which is .78.

Standardized Coefficients::

(Intercept)	Salary	Children	Catalogs	AgeOld	AgeMiddle	GenderMale	OwnHomeOwn	MarriedSingle
0.000000000	0.578037480	-0.280288205	0.267647275	-0.020694916	-0.044805928	-0.026852942	0.009132032	0.009738261

LocationFar	HistoryHigh	HistoryLow
0.277987717	0.166025949	0.036238379

cor(mkt)

##                     Salary     Children    Catalogs AmountSpent
## Salary         1.000000000  0.049663163  0.18355086   0.6995957
## Children       0.049663163  1.000000000 -0.11345543  -0.2223082
## Catalogs       0.183550862 -0.113455428  1.00000000   0.4726499
## AmountSpent    0.699595707 -0.222308170  0.47264989   1.0000000
## AgeOld         0.004346925 -0.385445575  0.03674313   0.1138452
## AgeMiddle      0.529051616  0.244719648  0.11408342   0.3013953
## GenderMale     0.261492181 -0.105469083  0.08735077   0.2016902
## OwnHomeOwn     0.460736395 -0.032274083  0.09313151   0.3508080
## MarriedSingle -0.675633080 -0.009770249 -0.13705989  -0.4758800
## LocationFar   -0.037127094  0.002391455  0.12858075   0.2526157
## HistoryHigh             NA           NA          NA          NA
## HistoryLow              NA           NA          NA          NA
##                     AgeOld   AgeMiddle   GenderMale  OwnHomeOwn
## Salary         0.004346925  0.52905162  0.261492181  0.46073640
## Children      -0.385445575  0.24471965 -0.105469083 -0.03227408
## Catalogs       0.036743130  0.11408342  0.087350767  0.09313151
## AmountSpent    0.113845240  0.30139535  0.201690213  0.35080800
## AgeOld         1.000000000 -0.51599165 -0.125200486  0.21422825
## AgeMiddle     -0.515991647  1.00000000  0.204232847  0.25164907
## GenderMale    -0.125200486  0.20423285  1.000000000  0.08443332
## OwnHomeOwn     0.214228248  0.25164907  0.084433317  1.00000000
## MarriedSingle -0.124300718 -0.15595721 -0.116057285 -0.26400932
## LocationFar    0.013920394 -0.04108406 -0.005553971 -0.03369129
## HistoryHigh             NA          NA           NA          NA
## HistoryLow              NA          NA           NA          NA
##               MarriedSingle  LocationFar HistoryHigh HistoryLow
## Salary         -0.675633080 -0.037127094          NA         NA
## Children       -0.009770249  0.002391455          NA         NA
## Catalogs       -0.137059886  0.128580754          NA         NA
## AmountSpent    -0.475879979  0.252615659          NA         NA
## AgeOld         -0.124300718  0.013920394          NA         NA
## AgeMiddle      -0.155957211 -0.041084058          NA         NA
## GenderMale     -0.116057285 -0.005553971          NA         NA
## OwnHomeOwn     -0.264009318 -0.033691291          NA         NA
## MarriedSingle   1.000000000  0.006964058          NA         NA
## LocationFar     0.006964058  1.000000000          NA         NA
## HistoryHigh              NA           NA           1         NA
## HistoryLow               NA           NA          NA          1

The amountspent is relatively correlated with salary.

mkt = mkt[-11:-12]

x1 = mkt[-4]
y1 = mkt$AmountSpent
out1 = summary(regsubsets(x1,y1,nbest = 2, nvmax = ncol(x1)))
tab1 = cbind(out1$which,out1$rsq,out1$adjr2,out1$cp)
tab1

##   (Intercept) Salary Children Catalogs AgeOld AgeMiddle GenderMale
## 1           1      1        0        0      0         0          0
## 1           1      0        0        0      0         0          0
## 2           1      1        0        1      0         0          0
## 2           1      1        0        0      0         0          0
## 3           1      1        0        1      0         0          0
## 3           1      1        1        1      0         0          0
## 4           1      1        1        1      0         0          0
## 4           1      1        0        1      1         0          0
## 5           1      1        1        1      0         1          0
## 5           1      1        1        1      0         0          1
## 6           1      1        1        1      0         1          0
## 6           1      1        1        1      0         1          0
## 7           1      1        1        1      0         1          1
## 7           1      1        1        1      0         1          0
## 8           1      1        1        1      0         1          1
## 8           1      1        1        1      1         1          0
## 9           1      1        1        1      1         1          1
##   OwnHomeOwn MarriedSingle LocationFar                                
## 1          0             0           0 0.4894342 0.4889226  789.018996
## 1          0             1           0 0.2264618 0.2256867 1708.412115
## 2          0             0           0 0.6120659 0.6112877  362.279098
## 2          0             0           1 0.5671535 0.5662852  519.300013
## 3          0             0           1 0.6662321 0.6652268  174.905276
## 3          0             0           0 0.6584354 0.6574066  202.163925
## 4          0             0           1 0.7148385 0.7136921    6.969632
## 4          0             0           1 0.6754368 0.6741321  144.724169
## 5          0             0           1 0.7154341 0.7140026    6.887546
## 5          0             0           1 0.7152832 0.7138510    7.415125
## 6          0             1           1 0.7159702 0.7142540    7.013017
## 6          1             0           1 0.7158587 0.7141418    7.403040
## 7          0             1           1 0.7163588 0.7143573    7.654407
## 7          1             1           1 0.7163331 0.7143315    7.744201
## 8          1             1           1 0.7166807 0.7143935    8.529178
## 8          1             1           1 0.7164212 0.7141320    9.436323
## 9          1             1           1 0.7168320 0.7142578   10.000000

According to the table above, the salary is most effective factor.

Data 3

plot(Salary~Experience, data = GenDis)

plot(Salary~Gender, data = GenDis)

d1 = rep(1,length(GenDis$Gender))
d2 = rep(0,length(GenDis$Gender))

GenDis$GenderFemale = ifelse(GenDis$Gender == "Female", d1,d2)
GD=GenDis[-1]
m4 = lm(Salary ~ ., data = GD)
summary(m4)

## 
## Call:
## lm(formula = Salary ~ ., data = GD)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -52779  -9806   -121   8347  60913 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   70280.6     2801.7  25.085  < 2e-16 ***
## Experience     1744.6      160.7  10.858  < 2e-16 ***
## GenderFemale -17020.6     2499.6  -6.809 1.06e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16910 on 205 degrees of freedom
## Multiple R-squared:  0.4413, Adjusted R-squared:  0.4359 
## F-statistic: 80.98 on 2 and 205 DF,  p-value: < 2.2e-16

lm.beta(m4)

## 
## Call:
## lm(formula = Salary ~ ., data = GD)
## 
## Standardized Coefficients::
##  (Intercept)   Experience GenderFemale 
##    0.0000000    0.5668653   -0.3555135

Adjusted R squre is low, less then .5, thus the model is not reliable enough.

cor(GD)

##              Experience     Salary GenderFemale
## Experience   1.00000000  0.5612383   0.01582798
## Salary       0.56123829  1.0000000  -0.34654117
## GenderFemale 0.01582798 -0.3465412   1.00000000

Experience and salary is somewhat related.

x2 = GD[,2:3]
y2 = GD[,1]
out2 = summary(regsubsets(x2,y2,nbest = 2, nvmax = ncol(x2)))
tab2 = cbind(out2$which,out2$rsq,out2$adjr2,out2$cp)
tab2

##   (Intercept) Salary GenderFemale                                    
## 1           1      1            0 0.3149884129  0.311663114  17.23611
## 1           1      0            1 0.0002505249 -0.004602628 118.88605
## 2           1      1            1 0.3652601506  0.359067567   3.00000

Homework#2

Shunya

2017年10月1日

Data #1

Relationship between Price and other variable

For Home ID, Offers, Brick, and Neighborhood, their variables are all nominal or categorical.

So it is nonsense to find numerical measure from those variable.

Standardized regression

y = -0.016X1 + 0.422X2 + 0.112X3 + 0.153X4 - 0.332X5 + 0.304X6 - .031X7 + 0.353X8

This is a good mode that represents relationship among price and other factor. Its adjusted R-squared is .86, which is high.

cross-validation for the model on all variables

cross-validation for the model on SqFt only

The cross validation for the model on SqFt only has low mean error, root mean squre error, and mean absolute percent error when it compered with the cross validation model with all other variables.

Data2

This model represents the relashionship among amount spent and other factors with high adjusted R square, which is .78.

Standardized Coefficients::

The amountspent is relatively correlated with salary.

According to the table above, the salary is most effective factor.

Data 3

Adjusted R squre is low, less then .5, thus the model is not reliable enough.

According to the table above, there is no significant relashionship among those variables.