Data #1
Relationship between Price and the other variables
# Draw Price against each candidate predictor, one figure per variable, in
# the order SqFt, Bedrooms, Bathrooms, Offers, Brick, Neighborhood.
invisible(lapply(
  c("SqFt", "Bedrooms", "Bathrooms", "Offers", "Brick", "Neighborhood"),
  function(predictor) {
    # Build the formula Price ~ <predictor> so axis labels match the names.
    price_formula <- reformulate(predictor, response = "Price")
    plot(price_formula, data = Houseprice)
  }
))

Home ID, Offers, Brick, and Neighborhood are all nominal or categorical variables.
So it is not meaningful to compute numerical measures directly from those variables.
# Change non-numeric values into numeric 0/1 indicator (dummy) variables.
# ifelse() recycles the scalars 1 and 0 over the whole column, so no
# pre-built vectors of ones and zeros are needed.
Houseprice$BrickYes <- ifelse(Houseprice$Brick == "Yes", 1, 0)
# Neighborhood gets one indicator for "East" and one for "West"; a row that
# matches neither value is coded 0 in both columns.
Houseprice$NeighborhoodEast <- ifelse(Houseprice$Neighborhood == "East", 1, 0)
Houseprice$NeighborhoodWest <- ifelse(Houseprice$Neighborhood == "West", 1, 0)
# Drop the original text columns by name rather than by position (7:8), so
# the selection keeps working if the column order of Houseprice changes.
HP <- Houseprice[setdiff(names(Houseprice), c("Brick", "Neighborhood"))]
# Regress Price on every remaining column of HP and print the fit summary.
m1 <- lm(Price ~ ., data = HP)
summary(m1)
##
## Call:
## lm(formula = Price ~ ., data = HP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27897.8 -6074.8 -48.7 5551.8 27536.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2037.726 8911.501 0.229 0.819524
## HomeID -11.456 25.387 -0.451 0.652616
## SqFt 53.634 5.926 9.051 3.30e-15 ***
## Bedrooms 4136.461 1621.775 2.551 0.012023 *
## Bathrooms 7975.157 2133.831 3.737 0.000287 ***
## Offers -8350.128 1103.693 -7.566 8.96e-12 ***
## BrickYes 17313.540 1988.548 8.707 2.12e-14 ***
## NeighborhoodEast -1729.613 2433.756 -0.711 0.478675
## NeighborhoodWest 20534.706 3176.051 6.465 2.33e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10050 on 119 degrees of freedom
## Multiple R-squared: 0.8688, Adjusted R-squared: 0.86
## F-statistic: 98.54 on 8 and 119 DF, p-value: < 2.2e-16
# Print the standardized coefficients of m1 so that predictors measured in
# different units can be compared with each other.
lm.beta(m1)
##
## Call:
## lm(formula = Price ~ ., data = HP)
##
## Standardized Coefficients::
## (Intercept) HomeID SqFt Bedrooms
## 0.0000000 -0.0158165 0.4223306 0.1117606
## Bathrooms Offers BrickYes NeighborhoodEast
## 0.1527110 -0.3323187 0.3037422 -0.0308560
## NeighborhoodWest
## 0.3531516
# Print the pairwise correlation matrix of all columns of HP (Price and
# every predictor, including the 0/1 indicator columns).
cor(HP)
## HomeID Price SqFt Bedrooms
## HomeID 1.00000000 0.1081900 0.16855330 -0.06856814
## Price 0.10818998 1.0000000 0.55298224 0.52592606
## SqFt 0.16855330 0.5529822 1.00000000 0.48380711
## Bedrooms -0.06856814 0.5259261 0.48380711 1.00000000
## Bathrooms 0.12769353 0.5232578 0.52274530 0.41455596
## Offers -0.05359711 -0.3136359 0.33692335 0.11427061
## BrickYes 0.03737646 0.4528168 0.07979216 0.04638008
## NeighborhoodEast -0.07550453 -0.1429589 0.04563915 -0.09175034
## NeighborhoodWest 0.02687339 0.7140066 0.25075921 0.47147686
## Bathrooms Offers BrickYes NeighborhoodEast
## HomeID 0.127693528 -0.05359711 0.03737646 -0.075504526
## Price 0.523257758 -0.31363588 0.45281679 -0.142958878
## SqFt 0.522745301 0.33692335 0.07979216 0.045639151
## Bedrooms 0.414555956 0.11427061 0.04638008 -0.091750338
## Bathrooms 1.000000000 0.14379340 0.17197691 -0.001247208
## Offers 0.143793404 1.00000000 -0.14498606 -0.015602052
## BrickYes 0.171976913 -0.14498606 1.00000000 0.147563896
## NeighborhoodEast -0.001247208 -0.01560205 0.14756390 1.000000000
## NeighborhoodWest 0.285923136 -0.32742521 0.11579310 -0.487421308
## NeighborhoodWest
## HomeID 0.02687339
## Price 0.71400660
## SqFt 0.25075921
## Bedrooms 0.47147686
## Bathrooms 0.28592314
## Offers -0.32742521
## BrickYes 0.11579310
## NeighborhoodEast -0.48742131
## NeighborhoodWest 1.00000000
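Standardized regression
y = -0.016X1 + 0.422X2 + 0.112X3 + 0.153X4 - 0.332X5 + 0.304X6 - 0.031X7 + 0.353X8
where X1 to X8 are HomeID, SqFt, Bedrooms, Bathrooms, Offers, BrickYes, NeighborhoodEast, and NeighborhoodWest, in that order.
This is a good model that represents the relationship between price and the other factors. Its adjusted R-squared is 0.86, which is high.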
library(leaps)
# Best-subset search for Price.
# The response and the predictors are selected by name: Price is the second
# column of HP, so the positional split y = HP[, 1], x = HP[, 2:9] used
# HomeID as the response and offered Price as a predictor.
predictor_names <- setdiff(names(HP), "Price")
x <- HP[, predictor_names]
y <- HP$Price
# Keep the two best models of every size, up to the full predictor set.
out <- summary(regsubsets(x, y, nbest = 2, nvmax = ncol(x)))
# One row per candidate model: inclusion flags for each term, followed by
# R-squared, adjusted R-squared and Mallows' Cp.
tab <- cbind(out$which, rsq = out$rsq, adjr2 = out$adjr2, cp = out$cp)
# NOTE: a table recorded from a run that used HomeID as the response is out
# of date and must be regenerated by re-running this chunk.
tab
## (Intercept) Price SqFt Bedrooms Bathrooms Offers BrickYes
## 1 1 0 1 0 0 0 0
## 1 1 0 0 0 1 0 0
## 2 1 0 1 1 0 0 0
## 2 1 0 1 0 0 1 0
## 3 1 0 1 1 0 1 0
## 3 1 0 1 1 0 0 0
## 4 1 0 1 1 0 1 0
## 4 1 0 1 1 1 1 0
## 5 1 0 1 1 0 1 0
## 5 1 0 1 1 1 1 0
## 6 1 0 1 1 1 1 0
## 6 1 1 1 1 1 1 0
## 7 1 1 1 1 1 1 0
## 7 1 0 1 1 1 1 1
## 8 1 1 1 1 1 1 1
## NeighborhoodEast NeighborhoodWest
## 1 0 0 0.02841022 0.020699186 5.087346
## 1 0 0 0.01630564 0.008498539 6.695584
## 2 0 0 0.05783147 0.042756774 3.178380
## 2 0 0 0.04215579 0.026830282 5.261082
## 3 0 0 0.07411571 0.051715287 3.014823
## 3 1 0 0.06917879 0.046658925 3.670752
## 4 1 0 0.08662784 0.056924677 3.352437
## 4 0 0 0.08085054 0.050959497 4.120021
## 5 1 1 0.09366017 0.056515094 4.418108
## 5 1 0 0.09331740 0.056158273 4.463649
## 6 1 1 0.10250879 0.058005098 5.242462
## 6 1 0 0.09734382 0.052584014 5.928690
## 7 1 1 0.10290746 0.050577060 7.189494
## 7 1 1 0.10280096 0.050464345 7.203645
## 8 1 1 0.10433371 0.044120849 9.000000
cross-validation for the model on all variables
# Leave-one-out cross-validation of the model Price ~ . on HP.
# For every row k the model is refitted WITHOUT row k and then used to
# predict row k, so each prediction error is a genuine out-of-sample error.
# NOTE: values of me / rmse / mape recorded from a run that fitted on all
# rows and predicted the training rows are out of date; re-run to refresh.
n <- nrow(HP)
# Preallocate one slot per held-out observation.
diff <- numeric(n)
percdiff <- numeric(n)
for (k in seq_len(n)) {
  ## train holds every row index except k;
  ## for k = 1 that is 2, 3, ..., n.
  train <- setdiff(seq_len(n), k)
  ## Fit on the n - 1 training rows only (not on the full data set).
  m2 <- lm(Price ~ ., data = HP[train, ])
  ## Predict the single held-out row; drop = FALSE keeps it a data frame.
  pred <- predict(m2, newdata = HP[k, , drop = FALSE])
  obs <- HP$Price[k]
  ## Signed error and absolute relative error for the held-out row.
  diff[k] <- obs - pred
  percdiff[k] <- abs(diff[k]) / obs
}
me <- mean(diff)
rmse <- sqrt(mean(diff^2))
mape <- 100 * mean(percdiff)
# mean error: mean of the signed differences obs - pred stored in diff,
# one per loop iteration
me
## [1] 26502.74
# root mean square error: square root of the mean of the squared differences
rmse
## [1] 37724.98
# mean absolute percent error: 100 * mean(|obs - pred| / obs)
mape
## [1] 20.42654
cross-validation for the model on SqFt only
# NOTE(review): me1, rmse1 and mape1 are not computed anywhere in this
# section; presumably they come from the same cross-validation loop refitted
# with SqFt as the only predictor. Confirm against the omitted code.
me1 # mean error
## [1] 0.1045893
rmse1 # root mean square error
## [1] 22634.2
mape1 # mean absolute percent error
## [1] 14.3107
The cross-validation for the model on SqFt only has a lower mean error, root mean square error, and mean absolute percent error than the cross-validation for the model with all variables.
This model represents the relationship between amount spent and the other factors with a high adjusted R-squared of 0.78.
Standardized Coefficients::
| 0.000000000 |
0.578037480 |
-0.280288205 |
0.267647275 |
-0.020694916 |
-0.044805928 |
-0.026852942 |
0.009132032 |
0.009738261 |
| 0.277987717 |
0.166025949 |
0.036238379 |
# Correlation matrix of every column of mkt.
# use = "pairwise.complete.obs" computes each coefficient from the rows in
# which both columns are observed. With the default (use = "everything")
# HistoryHigh and HistoryLow produce all-NA rows and columns, presumably
# because those indicators contain missing values (confirm against the
# data). Re-run to refresh any matrix recorded with the default setting.
cor(mkt, use = "pairwise.complete.obs")
## Salary Children Catalogs AmountSpent
## Salary 1.000000000 0.049663163 0.18355086 0.6995957
## Children 0.049663163 1.000000000 -0.11345543 -0.2223082
## Catalogs 0.183550862 -0.113455428 1.00000000 0.4726499
## AmountSpent 0.699595707 -0.222308170 0.47264989 1.0000000
## AgeOld 0.004346925 -0.385445575 0.03674313 0.1138452
## AgeMiddle 0.529051616 0.244719648 0.11408342 0.3013953
## GenderMale 0.261492181 -0.105469083 0.08735077 0.2016902
## OwnHomeOwn 0.460736395 -0.032274083 0.09313151 0.3508080
## MarriedSingle -0.675633080 -0.009770249 -0.13705989 -0.4758800
## LocationFar -0.037127094 0.002391455 0.12858075 0.2526157
## HistoryHigh NA NA NA NA
## HistoryLow NA NA NA NA
## AgeOld AgeMiddle GenderMale OwnHomeOwn
## Salary 0.004346925 0.52905162 0.261492181 0.46073640
## Children -0.385445575 0.24471965 -0.105469083 -0.03227408
## Catalogs 0.036743130 0.11408342 0.087350767 0.09313151
## AmountSpent 0.113845240 0.30139535 0.201690213 0.35080800
## AgeOld 1.000000000 -0.51599165 -0.125200486 0.21422825
## AgeMiddle -0.515991647 1.00000000 0.204232847 0.25164907
## GenderMale -0.125200486 0.20423285 1.000000000 0.08443332
## OwnHomeOwn 0.214228248 0.25164907 0.084433317 1.00000000
## MarriedSingle -0.124300718 -0.15595721 -0.116057285 -0.26400932
## LocationFar 0.013920394 -0.04108406 -0.005553971 -0.03369129
## HistoryHigh NA NA NA NA
## HistoryLow NA NA NA NA
## MarriedSingle LocationFar HistoryHigh HistoryLow
## Salary -0.675633080 -0.037127094 NA NA
## Children -0.009770249 0.002391455 NA NA
## Catalogs -0.137059886 0.128580754 NA NA
## AmountSpent -0.475879979 0.252615659 NA NA
## AgeOld -0.124300718 0.013920394 NA NA
## AgeMiddle -0.155957211 -0.041084058 NA NA
## GenderMale -0.116057285 -0.005553971 NA NA
## OwnHomeOwn -0.264009318 -0.033691291 NA NA
## MarriedSingle 1.000000000 0.006964058 NA NA
## LocationFar 0.006964058 1.000000000 NA NA
## HistoryHigh NA NA 1 NA
## HistoryLow NA NA NA 1
According to the table above, Salary is the most influential factor: it has the strongest correlation with AmountSpent (0.70).
Data 3
# Draw Salary against each explanatory variable, one figure per variable.
invisible(lapply(
  c("Experience", "Gender"),
  function(predictor) {
    # Build the formula Salary ~ <predictor> so axis labels match the names.
    salary_formula <- reformulate(predictor, response = "Salary")
    plot(salary_formula, data = GenDis)
  }
))

# Encode Gender as a 0/1 indicator (1 when Gender is "Female").
# ifelse() recycles the scalars 1 and 0, so no helper vectors are needed.
GenDis$GenderFemale <- ifelse(GenDis$Gender == "Female", 1, 0)
# Drop the original Gender column by name instead of assuming it is the
# first column of GenDis.
GD <- GenDis[setdiff(names(GenDis), "Gender")]
# Regress Salary on every remaining column of GD and print the fit summary.
m4 <- lm(Salary ~ ., data = GD)
summary(m4)
##
## Call:
## lm(formula = Salary ~ ., data = GD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52779 -9806 -121 8347 60913
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70280.6 2801.7 25.085 < 2e-16 ***
## Experience 1744.6 160.7 10.858 < 2e-16 ***
## GenderFemale -17020.6 2499.6 -6.809 1.06e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16910 on 205 degrees of freedom
## Multiple R-squared: 0.4413, Adjusted R-squared: 0.4359
## F-statistic: 80.98 on 2 and 205 DF, p-value: < 2.2e-16
# Print the standardized coefficients of m4 so that Experience and
# GenderFemale can be compared on a common scale.
lm.beta(m4)
##
## Call:
## lm(formula = Salary ~ ., data = GD)
##
## Standardized Coefficients::
## (Intercept) Experience GenderFemale
## 0.0000000 0.5668653 -0.3555135
The adjusted R-squared is low (less than 0.5), so the model is not reliable enough.
# Print the pairwise correlation matrix of all columns of GD.
cor(GD)
## Experience Salary GenderFemale
## Experience 1.00000000 0.5612383 0.01582798
## Salary 0.56123829 1.0000000 -0.34654117
## GenderFemale 0.01582798 -0.3465412 1.00000000
According to the table above, there is no significant relationship among those variables.