## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: Matrix
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Using Y1 and X1
Y <- c(20.3, 19.1, 19.3, 20.9, 22.0, 23.5, 13.8, 18.8, 20.9, 18.6,
22.3, 17.6, 20.8, 28.7, 15.2, 20.9, 18.4, 10.3, 26.3, 28.1)
X <- c(9.3, 4.1, 22.4, 9.1, 15.8, 7.1, 15.9, 6.9, 16.0, 6.7, 8.2,
16.0, 6.4, 11.8, 3.5, 21.7, 12.2, 9.3, 8.0, 6.2)
df <- data.frame(X, Y)
We need to calculate the probability that a random number in X is greater than the 3rd quartile, given that we have a random number in Y that is greater than the 1st quartile. So first, we’ll need to define the 3rd quartile of X and the 1st quartile of Y, using the built in R function quantile.
XQ3 <- quantile(df$X, 0.75)
YQ1 <- quantile(df$Y, 0.25)
We can test these values very easily, given that each set of numbers is 20. If XQ3 is the 3rd quartile, then 25% of the numbers in X will be greater than it. Likewise, if YQ1 is the 1st quartile, then 75% of the numbers in Y will be greater than it.
length(df[df$X>XQ3, 1]) / length(df$X)
## [1] 0.25
length(df[df$Y>YQ1, 2]) / length(df$Y)
## [1] 0.75
So what is \(P(X>x|Y>y)\)? We can easily do this just by filtering the dataset for the values that only follow that conditional, then dividing by the total number of entries.
filter(df, X > XQ3 & Y > YQ1)
## X Y
## 1 22.4 19.3
## 2 16.0 20.9
## 3 21.7 20.9
We see that only three values of X meet this requirement, so \(P(X>x|Y>y) = \frac{3}{20} = 0.15\)
This one isn’t conditional, so we’re simply looking for all values that satisfy \(P(X>x)\) and all values that satisfy \(P(Y>y)\). Given that they’re quantiles, however, we know that this is simply \(0.75 * 0.25\), but we can doublecheck against our data.
p.xg3 <- filter(df, X > XQ3) %>%
tally()/nrow(df)
p.yg1 <- filter(df, Y > YQ1) %>%
tally()/nrow(df)
p.xg3
## n
## 1 0.25
p.yg1
## n
## 1 0.75
p.xg3 * p.yg1
## n
## 1 0.1875
And indeed, \(P(X>x, Y>y) = 0.1875\)
ans.b <- p.xg3 * p.yg1
ans.b
## n
## 1 0.1875
We know \(P(Y>y)\), but now we must calculate \(P(X<x)\). This is actually simple, as it should almost be \(1-P(X>x)\) It might not be exact, if the 3rd quartile value of X happens to also exist in the dataset.
filter(df, X < XQ3) %>%
tally()/nrow(df)
## n
## 1 0.75
So as with our first problem, we can simply filter the dataset and find the values that satisfy our conditional. Normally we could use math to figure this out, like \(P(X < x|Y>y) = \frac{P(X < x \text{ and } Y >y)}{P(Y>y)}\), but that usually requires knowing the other conditional.
filter(df, X < XQ3 & Y > YQ1)
## X Y
## 1 9.3 20.3
## 2 4.1 19.1
## 3 9.1 20.9
## 4 15.8 22.0
## 5 7.1 23.5
## 6 6.9 18.8
## 7 6.7 18.6
## 8 8.2 22.3
## 9 6.4 20.8
## 10 11.8 28.7
## 11 8.0 26.3
## 12 6.2 28.1
filter(df, X < XQ3 & Y > YQ1) %>%
tally()/nrow(df)
## n
## 1 0.6
\(P(X < X | Y > y) = 0.6\)
a1 <- filter(df, X <= XQ3 & Y <= YQ1) %>%
tally()
a2 <- filter(df, X <= XQ3 & Y > YQ1) %>%
tally()
b1 <- filter(df, X > XQ3 & Y <= YQ1) %>%
tally()
b2 <- filter(df, X > XQ3 & Y > YQ1) %>%
tally()
count.tab <- matrix(c(a1, a2, a1+a2, b1, b2, b1+b2, a1+b1, a2+b2, a1+a2+b1+b2),
ncol = 3, nrow = 3)
colnames(count.tab) <- c("X <= 3rd Quartile", "X >3rd Quartile", "Total")
rownames(count.tab) <- c("Y <= 1st Quartile", "Y > 1st Quartile", "Total")
count.tab
## X <= 3rd Quartile X >3rd Quartile Total
## Y <= 1st Quartile 3 2 5
## Y > 1st Quartile 12 3 15
## Total 15 5 20
Well, looking at our table \(A = 5/20 = 0.25\) and \(B = 15/20 = 0.75\). That makes \(P(A)P(B) = 0.1875\). What about \(P(AB)\)? According to the table that’s \(3/20 = 0.15\), which we calculated earlier. \(0.1875 \neq 0.15\), so they’re not independent.
Verifying with Chi Squared Test
chisq.test(df)
##
## Pearson's Chi-squared test
##
## data: df
## X-squared = 40.234, df = 19, p-value = 0.003048
The p-value is less than the typical significance level of 0.05, so we reject the null hypothesis that the data is independent.
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 20 :536 C (all): 10 Min. : 21.00
## 1st Qu.: 365.8 60 :299 FV : 65 1st Qu.: 59.00
## Median : 730.5 50 :144 RH : 16 Median : 69.00
## Mean : 730.5 120 : 87 RL :1151 Mean : 70.05
## 3rd Qu.:1095.2 30 : 69 RM : 218 3rd Qu.: 80.00
## Max. :1460.0 160 : 63 Max. :313.00
## (Other):262 NA's :259
## LotArea Street Alley LotShape LandContour
## Min. : 1300 Grvl: 6 Grvl: 50 IR1:484 Bnk: 63
## 1st Qu.: 7554 Pave:1454 Pave: 41 IR2: 41 HLS: 50
## Median : 9478 NA's:1369 IR3: 10 Low: 36
## Mean : 10517 Reg:925 Lvl:1311
## 3rd Qu.: 11602
## Max. :215245
##
## Utilities LotConfig LandSlope Neighborhood Condition1
## AllPub:1459 Corner : 263 Gtl:1382 NAmes :225 Norm :1260
## NoSeWa: 1 CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81
## FR2 : 47 Sev: 13 OldTown:113 Artery : 48
## FR3 : 4 Edwards:100 RRAn : 26
## Inside :1052 Somerst: 86 PosN : 19
## Gilbert: 79 RRAe : 11
## (Other):707 (Other): 15
## Condition2 BldgType HouseStyle OverallQual OverallCond
## Norm :1445 1Fam :1220 1Story :726 5 :397 5 :821
## Feedr : 6 2fmCon: 31 2Story :445 6 :374 6 :252
## Artery : 2 Duplex: 52 1.5Fin :154 7 :319 7 :205
## PosN : 2 Twnhs : 43 SLvl : 65 8 :168 8 : 72
## RRNn : 2 TwnhsE: 114 SFoyer : 37 4 :116 4 : 57
## PosA : 1 1.5Unf : 14 9 : 43 3 : 25
## (Other): 2 (Other): 19 (Other): 43 (Other): 28
## YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st
## 2006 : 67 1950 :178 Flat : 13 CompShg:1434 VinylSd:515
## 2005 : 64 2006 : 97 Gable :1141 Tar&Grv: 11 HdBoard:222
## 2004 : 54 2007 : 76 Gambrel: 11 WdShngl: 6 MetalSd:220
## 2007 : 49 2005 : 73 Hip : 286 WdShake: 5 Wd Sdng:206
## 2003 : 45 2004 : 62 Mansard: 7 ClyTile: 1 Plywood:108
## 1976 : 33 2000 : 55 Shed : 2 Membran: 1 CemntBd: 61
## (Other):1148 (Other):919 (Other): 2 (Other):128
## Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond
## VinylSd:504 BrkCmn : 15 Min. : 0.0 Ex: 52 Ex: 3
## MetalSd:214 BrkFace:445 1st Qu.: 0.0 Fa: 14 Fa: 28
## HdBoard:207 None :864 Median : 0.0 Gd:488 Gd: 146
## Wd Sdng:197 Stone :128 Mean : 103.7 TA:906 Po: 1
## Plywood:142 NA's : 8 3rd Qu.: 166.0 TA:1282
## CmentBd: 60 Max. :1600.0
## (Other):136 NA's :8
## Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1
## BrkTil:146 Ex :121 Fa : 45 Av :221 ALQ :220
## CBlock:634 Fa : 35 Gd : 65 Gd :134 BLQ :148
## PConc :647 Gd :618 Po : 2 Mn :114 GLQ :418
## Slab : 24 TA :649 TA :1311 No :953 LwQ : 74
## Stone : 6 NA's: 37 NA's: 37 NA's: 38 Rec :133
## Wood : 3 Unf :430
## NA's: 37
## BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## Min. : 0.0 ALQ : 19 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0 BLQ : 33 1st Qu.: 0.00 1st Qu.: 223.0
## Median : 383.5 GLQ : 14 Median : 0.00 Median : 477.5
## Mean : 443.6 LwQ : 46 Mean : 46.55 Mean : 567.2
## 3rd Qu.: 712.2 Rec : 54 3rd Qu.: 0.00 3rd Qu.: 808.0
## Max. :5644.0 Unf :1256 Max. :1474.00 Max. :2336.0
## NA's: 38
## TotalBsmtSF Heating HeatingQC CentralAir Electrical
## Min. : 0.0 Floor: 1 Ex:741 N: 95 FuseA: 94
## 1st Qu.: 795.8 GasA :1428 Fa: 49 Y:1365 FuseF: 27
## Median : 991.5 GasW : 18 Gd:241 FuseP: 3
## Mean :1057.4 Grav : 7 Po: 1 Mix : 1
## 3rd Qu.:1298.2 OthW : 2 TA:428 SBrkr:1334
## Max. :6110.0 Wall : 4 NA's : 1
##
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## Min. : 334 Min. : 0 Min. : 0.000 Min. : 334
## 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130
## Median :1087 Median : 0 Median : 0.000 Median :1464
## Mean :1163 Mean : 347 Mean : 5.845 Mean :1515
## 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777
## Max. :4692 Max. :2065 Max. :572.000 Max. :5642
##
## BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr
## 0:856 0:1378 0: 9 0:913 3 :804 0: 1
## 1:588 1: 80 1:650 1:535 2 :358 1:1392
## 2: 15 2: 2 2:768 2: 12 4 :213 2: 65
## 3: 1 3: 33 1 : 50 3: 2
## 5 : 21
## 6 : 7
## (Other): 7
## KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## Ex:100 6 :402 Maj1: 14 0:690 Ex : 24
## Fa: 39 7 :329 Maj2: 5 1:650 Fa : 33
## Gd:586 5 :275 Min1: 31 2:115 Gd :380
## TA:735 8 :187 Min2: 34 3: 5 Po : 20
## 4 : 97 Mod : 15 TA :313
## 9 : 75 Sev : 1 NA's:690
## (Other): 95 Typ :1360
## GarageType GarageYrBlt GarageFinish GarageCars
## 2Types : 6 2005 : 65 Fin :352 Min. :0.000
## Attchd :870 2006 : 59 RFn :422 1st Qu.:1.000
## Basment: 19 2004 : 53 Unf :605 Median :2.000
## BuiltIn: 88 2003 : 50 NA's: 81 Mean :1.767
## CarPort: 9 2007 : 49 3rd Qu.:2.000
## Detchd :387 (Other):1103 Max. :4.000
## NA's : 81 NA's : 81
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF
## Min. : 0.0 Ex : 3 Ex : 2 N: 90 Min. : 0.00
## 1st Qu.: 334.5 Fa : 48 Fa : 35 P: 30 1st Qu.: 0.00
## Median : 480.0 Gd : 14 Gd : 9 Y:1340 Median : 0.00
## Mean : 473.0 Po : 3 Po : 7 Mean : 94.24
## 3rd Qu.: 576.0 TA :1311 TA :1326 3rd Qu.:168.00
## Max. :1418.0 NA's: 81 NA's: 81 Max. :857.00
##
## OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 25.00 Median : 0.00 Median : 0.00 Median : 0.00
## Mean : 46.66 Mean : 21.95 Mean : 3.41 Mean : 15.06
## 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :547.00 Max. :552.00 Max. :508.00 Max. :480.00
##
## PoolArea PoolQC Fence MiscFeature MiscVal
## Min. : 0.000 Ex : 2 GdPrv: 59 Gar2: 2 Min. : 0.00
## 1st Qu.: 0.000 Fa : 2 GdWo : 54 Othr: 2 1st Qu.: 0.00
## Median : 0.000 Gd : 3 MnPrv: 157 Shed: 49 Median : 0.00
## Mean : 2.759 NA's:1453 MnWw : 11 TenC: 1 Mean : 43.49
## 3rd Qu.: 0.000 NA's :1179 NA's:1406 3rd Qu.: 0.00
## Max. :738.000 Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition SalePrice
## 6 :253 2006:314 WD :1267 Abnorml: 101 Min. : 34900
## 7 :234 2007:329 New : 122 AdjLand: 4 1st Qu.:129975
## 5 :204 2008:304 COD : 43 Alloca : 12 Median :163000
## 4 :141 2009:338 ConLD : 9 Family : 20 Mean :180921
## 8 :122 2010:175 ConLI : 5 Normal :1198 3rd Qu.:214000
## 3 :106 ConLw : 5 Partial: 125 Max. :755000
## (Other):400 (Other): 9
plot(house.train$SalePrice~house.train$X1stFlrSF, main = "1st Floor Square Feet vs Sale Price",
xlab = "Square Feet", ylab = "Sale Price", col = "purple")
plot(house.train$SalePrice~house.train$Fireplaces, main = "Num of Fireplaces vs Sale Price",
xlab = "Fireplaces", ylab = "Sale Price", col = "orange")
cor.mat <- cor(as.matrix(house.train[, c("LotArea", "TotalBsmtSF", "WoodDeckSF")]))
round(cor.mat, 2)
## LotArea TotalBsmtSF WoodDeckSF
## LotArea 1.00 0.26 0.17
## TotalBsmtSF 0.26 1.00 0.23
## WoodDeckSF 0.17 0.23 1.00
cor.test(house.train$LotArea, house.train$TotalBsmtSF)
##
## Pearson's product-moment correlation
##
## data: house.train$LotArea and house.train$TotalBsmtSF
## t = 10.317, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2123727 0.3080138
## sample estimates:
## cor
## 0.2608331
cor.test(house.train$LotArea, house.train$WoodDeckSF)
##
## Pearson's product-moment correlation
##
## data: house.train$LotArea and house.train$WoodDeckSF
## t = 6.6549, df = 1458, p-value = 4.001e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1214653 0.2210529
## sample estimates:
## cor
## 0.1716977
cor.test(house.train$TotalBsmtSF, house.train$WoodDeckSF)
##
## Pearson's product-moment correlation
##
## data: house.train$TotalBsmtSF and house.train$WoodDeckSF
## t = 9.1079, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1828933 0.2799882
## sample estimates:
## cor
## 0.2320186
Since the correlation between all three variables are 0, I wouldn’t be worried about the familywise error. Even with a Bonferroni correction for those 3, the p-value is still beneath it as it only becomes \(\frac{0.5}{3} = 0.167\).
inv.cor <- solve(cor.mat)
inv.cor
## LotArea TotalBsmtSF WoodDeckSF
## LotArea 1.0882554 -0.2541836 -0.1278756
## TotalBsmtSF -0.2541836 1.1162651 -0.2153515
## WoodDeckSF -0.1278756 -0.2153515 1.0719215
Correlation Matrix multiplied by Precision Matrix
a <- cor.mat %*% inv.cor
a
## LotArea TotalBsmtSF WoodDeckSF
## LotArea 1.000000e+00 1.387779e-17 0
## TotalBsmtSF 2.081668e-17 1.000000e+00 0
## WoodDeckSF 2.775558e-17 -8.326673e-17 1
Precision Matrix multiplied by Correlation Matrix
b <- inv.cor %*% cor.mat
b
## LotArea TotalBsmtSF WoodDeckSF
## LotArea 1.000000e+00 7.632783e-17 2.775558e-17
## TotalBsmtSF -4.163336e-17 1.000000e+00 -8.326673e-17
## WoodDeckSF 0.000000e+00 0.000000e+00 1.000000e+00
LU Decomposition of both
a.lu <- expand(lu(t(a)))
a.l <- a.lu$U
a.l[lower.tri(a.lu$U)] <- a.lu$L[lower.tri(a.lu$L)]
a.l
## 3 x 3 Matrix of class "dgeMatrix"
## [,1] [,2] [,3]
## [1,] 1.000000e+00 2.081668e-17 2.775558e-17
## [2,] 1.387779e-17 1.000000e+00 -8.326673e-17
## [3,] 0.000000e+00 0.000000e+00 1.000000e+00
b.lu <- expand(lu(t(b)))
b.l <- b.lu$U
b.l[lower.tri(b.lu$U)] <- b.lu$L[lower.tri(b.lu$L)]
b.l
## 3 x 3 Matrix of class "dgeMatrix"
## [,1] [,2] [,3]
## [1,] 1.000000e+00 -4.163336e-17 0
## [2,] 7.632783e-17 1.000000e+00 0
## [3,] 2.775558e-17 -8.326673e-17 1
hist(house.train$X1stFlrSF)
summary(house.train$X1stFlrSF)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 882 1087 1163 1391 4692
The mean value is 1163, which is larger than the median value of 1087. That indicates a right-skewness to this variable, and the min is already above zero.
x <- fitdistr(house.train$X1stFlrSF, densfun = "exponential")
lambda <- x$estimate
lambda
## rate
## 0.0008601213
lam.dist <- rexp(1000, lambda)
par(mfrow = c(1, 2))
hist(lam.dist, main = "Lambda Hist")
hist(house.train$X1stFlrSF, main = "Original Hist")
per5 <- qexp(0.05, rate = lambda, lower.tail = TRUE, log.p = FALSE)
per95 <- qexp(0.95, rate = lambda, lower.tail = TRUE, log.p = FALSE)
conf.int95 <- qnorm(0.95, mean(house.train$X1stFlrSF), sd(house.train$X1stFlrSF))
per5
## [1] 59.63495
per95
## [1] 3482.918
conf.int95
## [1] 1798.507
quantile(house.train$X1stFlrSF, c(0.05, 0.95))
## 5% 95%
## 672.95 1831.25
We can see the difference between a simulated PDF and the actual data. The simulated data is a little more perfectly right skewed when compared to the actual data, which is somewhat messier. It’s nice to use to talk about in a general, academic sense, or to perhaps identify trends in the data, but that might be about it.
Since there is a lot of variables, we’ll simply utilize a backward stepwise regression, eliminating all variables that are deemed insignificant. There are too many variables and too short a time to do a lot of transformations on the data.
clean <- function(df){
# Split factor variables from non-factor variables
x <- sapply(df, function(x) is.factor(x))
dat <- df[, x]
non <- df[, !x]
# Replace NA with 0 for numeric variables
n <- sapply(non, function(x) anyNA(x))
final <- non[, !n]
non <- non[, n]
non[is.na(non)] <- 0
final <- cbind(final, non)
# Split factor variables with NA's
y <- sapply(dat, function(x) anyNA(x))
z <- dat[, !y]
for(i in 1:length(z)){
z[, i] <- factor(z[, i], levels = c(levels(z[, i]), "Z"))
}
final <- cbind(final, z)
dat <- dat[, y]
# Force NA to be a factor
# Rename it to 'Z'
for(i in 1:length(dat)){
dat[, i] <- factor(dat[, i], levels = levels(addNA(dat[, i])),
labels = c(levels(dat[, i]), "Z"),
exclude = NULL)
}
final <- cbind(final, dat)
return(final)
}
house.train <- clean(house.train)
mod <- lm(SalePrice~., data = house.train)
final.model <- step(mod, direction = "backward", trace = 0)
summary(final.model)
##
## Call:
## lm(formula = SalePrice ~ LotArea + BsmtFinSF1 + BsmtFinSF2 +
## BsmtUnfSF + X1stFlrSF + X2ndFlrSF + GarageCars + GarageArea +
## X3SsnPorch + ScreenPorch + PoolArea + MasVnrArea + MSZoning +
## Street + LandContour + Utilities + LotConfig + LandSlope +
## Neighborhood + Condition1 + Condition2 + BldgType + OverallQual +
## OverallCond + RoofMatl + Exterior1st + Foundation + HeatingQC +
## FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + KitchenQual +
## TotRmsAbvGrd + Functional + Fireplaces + SaleCondition +
## BsmtQual + BsmtExposure + BsmtFinType1 + GarageType + PoolQC,
## data = house.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -172663 -9364 15 9825 172663
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.987e+05 8.621e+04 -10.424 < 2e-16 ***
## LotArea 5.307e-01 9.205e-02 5.765 1.02e-08 ***
## BsmtFinSF1 3.871e+01 4.423e+00 8.753 < 2e-16 ***
## BsmtFinSF2 3.349e+01 5.501e+00 6.088 1.51e-09 ***
## BsmtUnfSF 1.985e+01 4.118e+00 4.821 1.60e-06 ***
## X1stFlrSF 4.184e+01 4.880e+00 8.574 < 2e-16 ***
## X2ndFlrSF 4.203e+01 3.766e+00 11.160 < 2e-16 ***
## GarageCars 3.595e+03 2.117e+03 1.698 0.089681 .
## GarageArea 2.366e+01 7.019e+00 3.371 0.000771 ***
## X3SsnPorch 5.279e+01 2.111e+01 2.501 0.012523 *
## ScreenPorch 3.684e+01 1.146e+01 3.214 0.001342 **
## PoolArea 6.852e+02 1.597e+02 4.290 1.92e-05 ***
## MasVnrArea 1.467e+01 4.305e+00 3.408 0.000675 ***
## MSZoningFV 3.296e+04 1.113e+04 2.962 0.003115 **
## MSZoningRH 3.083e+04 1.134e+04 2.719 0.006646 **
## MSZoningRL 2.996e+04 9.554e+03 3.136 0.001754 **
## MSZoningRM 2.885e+04 8.905e+03 3.240 0.001225 **
## StreetPave 1.487e+04 1.083e+04 1.374 0.169813
## LandContourHLS 7.973e+03 4.780e+03 1.668 0.095572 .
## LandContourLow -2.421e+03 5.913e+03 -0.409 0.682345
## LandContourLvl 6.755e+03 3.395e+03 1.990 0.046814 *
## UtilitiesNoSeWa -3.816e+04 2.385e+04 -1.600 0.109905
## LotConfigCulDSac 7.439e+03 2.914e+03 2.553 0.010793 *
## LotConfigFR2 -6.564e+03 3.702e+03 -1.773 0.076414 .
## LotConfigFR3 -1.643e+04 1.186e+04 -1.385 0.166191
## LotConfigInside -9.882e+02 1.632e+03 -0.606 0.544917
## LandSlopeMod 8.082e+03 3.673e+03 2.200 0.027977 *
## LandSlopeSev -2.394e+04 1.009e+04 -2.372 0.017838 *
## NeighborhoodBlueste -3.668e+03 1.761e+04 -0.208 0.834989
## NeighborhoodBrDale -6.100e+03 1.017e+04 -0.600 0.548636
## NeighborhoodBrkSide -1.047e+04 8.493e+03 -1.232 0.218099
## NeighborhoodClearCr -9.589e+03 8.488e+03 -1.130 0.258793
## NeighborhoodCollgCr -7.058e+03 6.781e+03 -1.041 0.298089
## NeighborhoodCrawfor 6.238e+03 7.838e+03 0.796 0.426289
## NeighborhoodEdwards -2.244e+04 7.378e+03 -3.042 0.002397 **
## NeighborhoodGilbert -5.543e+03 7.164e+03 -0.774 0.439220
## NeighborhoodIDOTRR -1.995e+04 9.554e+03 -2.089 0.036939 *
## NeighborhoodMeadowV -2.244e+04 1.029e+04 -2.181 0.029371 *
## NeighborhoodMitchel -1.733e+04 7.574e+03 -2.288 0.022302 *
## NeighborhoodNAmes -1.746e+04 7.240e+03 -2.412 0.016015 *
## NeighborhoodNoRidge 2.359e+04 7.910e+03 2.982 0.002919 **
## NeighborhoodNPkVill 4.619e+03 1.023e+04 0.451 0.651710
## NeighborhoodNridgHt 1.403e+04 6.839e+03 2.052 0.040375 *
## NeighborhoodNWAmes -1.556e+04 7.470e+03 -2.083 0.037424 *
## NeighborhoodOldTown -2.252e+04 8.589e+03 -2.622 0.008855 **
## NeighborhoodSawyer -1.127e+04 7.542e+03 -1.494 0.135369
## NeighborhoodSawyerW -2.423e+03 7.261e+03 -0.334 0.738641
## NeighborhoodSomerst 3.319e+03 8.193e+03 0.405 0.685473
## NeighborhoodStoneBr 2.719e+04 7.748e+03 3.510 0.000464 ***
## NeighborhoodSWISU -1.750e+04 8.727e+03 -2.005 0.045176 *
## NeighborhoodTimber -8.915e+03 7.654e+03 -1.165 0.244378
## NeighborhoodVeenker 2.182e+03 9.656e+03 0.226 0.821284
## Condition1Feedr 2.490e+03 4.533e+03 0.549 0.582816
## Condition1Norm 1.349e+04 3.747e+03 3.601 0.000329 ***
## Condition1PosA 1.323e+04 9.177e+03 1.442 0.149585
## Condition1PosN 1.294e+04 6.744e+03 1.919 0.055238 .
## Condition1RRAe -1.139e+04 8.459e+03 -1.347 0.178251
## Condition1RRAn 1.229e+04 6.270e+03 1.960 0.050224 .
## Condition1RRNe -5.547e+03 1.661e+04 -0.334 0.738418
## Condition1RRNn 1.096e+04 1.176e+04 0.932 0.351681
## Condition2Feedr 1.842e+03 2.109e+04 0.087 0.930403
## Condition2Norm -7.315e+02 1.828e+04 -0.040 0.968088
## Condition2PosA -1.268e+04 3.065e+04 -0.414 0.679072
## Condition2PosN -2.512e+05 2.608e+04 -9.635 < 2e-16 ***
## Condition2RRAe -4.807e+04 3.113e+04 -1.544 0.122753
## Condition2RRAn -3.120e+03 2.937e+04 -0.106 0.915401
## Condition2RRNn 8.658e+03 2.496e+04 0.347 0.728713
## BldgType2fmCon -4.712e+03 5.390e+03 -0.874 0.382185
## BldgTypeDuplex -4.425e+03 6.588e+03 -0.672 0.501844
## BldgTypeTwnhs -1.922e+04 5.097e+03 -3.771 0.000170 ***
## BldgTypeTwnhsE -1.561e+04 3.612e+03 -4.321 1.67e-05 ***
## OverallQual2 6.779e+04 3.027e+04 2.239 0.025311 *
## OverallQual3 4.521e+04 2.750e+04 1.644 0.100402
## OverallQual4 4.816e+04 2.742e+04 1.757 0.079231 .
## OverallQual5 4.795e+04 2.756e+04 1.740 0.082152 .
## OverallQual6 5.167e+04 2.764e+04 1.869 0.061819 .
## OverallQual7 6.002e+04 2.770e+04 2.167 0.030438 *
## OverallQual8 7.434e+04 2.779e+04 2.675 0.007572 **
## OverallQual9 1.105e+05 2.814e+04 3.926 9.11e-05 ***
## OverallQual10 1.502e+05 2.882e+04 5.213 2.17e-07 ***
## OverallCond2 -4.640e+04 4.105e+04 -1.130 0.258595
## OverallCond3 -6.996e+04 3.904e+04 -1.792 0.073384 .
## OverallCond4 -5.625e+04 3.967e+04 -1.418 0.156467
## OverallCond5 -4.810e+04 3.964e+04 -1.213 0.225248
## OverallCond6 -4.362e+04 3.963e+04 -1.101 0.271262
## OverallCond7 -3.596e+04 3.964e+04 -0.907 0.364474
## OverallCond8 -3.451e+04 3.967e+04 -0.870 0.384406
## OverallCond9 -2.768e+04 3.997e+04 -0.693 0.488742
## RoofMatlCompShg 5.863e+05 4.505e+04 13.014 < 2e-16 ***
## RoofMatlMembran 6.410e+05 5.272e+04 12.160 < 2e-16 ***
## RoofMatlMetal 6.108e+05 5.268e+04 11.595 < 2e-16 ***
## RoofMatlRoll 5.860e+05 5.097e+04 11.497 < 2e-16 ***
## RoofMatlTar&Grv 5.719e+05 4.660e+04 12.273 < 2e-16 ***
## RoofMatlWdShake 6.008e+05 4.688e+04 12.816 < 2e-16 ***
## RoofMatlWdShngl 6.292e+05 4.539e+04 13.862 < 2e-16 ***
## Exterior1stAsphShn 6.832e+03 2.399e+04 0.285 0.775836
## Exterior1stBrkComm 1.597e+04 1.875e+04 0.851 0.394734
## Exterior1stBrkFace 2.146e+04 6.689e+03 3.208 0.001371 **
## Exterior1stCBlock -8.039e+03 2.354e+04 -0.341 0.732786
## Exterior1stCemntBd 1.191e+04 7.037e+03 1.693 0.090749 .
## Exterior1stHdBoard 3.564e+03 6.133e+03 0.581 0.561320
## Exterior1stImStucc -1.311e+04 2.314e+04 -0.566 0.571234
## Exterior1stMetalSd 7.448e+03 5.981e+03 1.245 0.213252
## Exterior1stPlywood 3.322e+03 6.440e+03 0.516 0.606063
## Exterior1stStone -4.021e+03 1.851e+04 -0.217 0.828035
## Exterior1stStucco 6.734e+03 7.490e+03 0.899 0.368792
## Exterior1stVinylSd 8.953e+03 6.021e+03 1.487 0.137286
## Exterior1stWd Sdng 4.714e+03 5.937e+03 0.794 0.427373
## Exterior1stWdShing 3.496e+03 7.375e+03 0.474 0.635616
## FoundationCBlock 5.628e+03 2.749e+03 2.047 0.040815 *
## FoundationPConc 9.647e+03 2.979e+03 3.238 0.001233 **
## FoundationSlab 6.302e+03 8.665e+03 0.727 0.467191
## FoundationStone 1.253e+04 9.691e+03 1.293 0.196222
## FoundationWood -1.900e+04 1.387e+04 -1.370 0.170805
## HeatingQCFa -5.899e+03 3.859e+03 -1.529 0.126585
## HeatingQCGd -4.522e+03 1.906e+03 -2.373 0.017777 *
## HeatingQCPo 5.753e+03 2.391e+04 0.241 0.809880
## HeatingQCTA -4.639e+03 1.884e+03 -2.463 0.013923 *
## FullBath1 -8.574e+03 1.335e+04 -0.642 0.520706
## FullBath2 -7.105e+03 1.351e+04 -0.526 0.599048
## FullBath3 1.955e+04 1.447e+04 1.352 0.176728
## HalfBath1 5.103e+03 1.959e+03 2.605 0.009285 **
## HalfBath2 -1.139e+03 8.055e+03 -0.141 0.887601
## BedroomAbvGr1 1.578e+02 1.477e+04 0.011 0.991481
## BedroomAbvGr2 3.489e+03 1.455e+04 0.240 0.810585
## BedroomAbvGr3 -2.058e+02 1.469e+04 -0.014 0.988823
## BedroomAbvGr4 2.711e+03 1.490e+04 0.182 0.855654
## BedroomAbvGr5 -1.442e+04 1.598e+04 -0.903 0.366916
## BedroomAbvGr6 -1.401e+04 1.816e+04 -0.771 0.440725
## BedroomAbvGr8 -4.896e+03 3.013e+04 -0.162 0.870973
## KitchenAbvGr1 2.546e+04 2.847e+04 0.895 0.371207
## KitchenAbvGr2 8.824e+03 2.820e+04 0.313 0.754438
## KitchenAbvGr3 -8.287e+02 3.316e+04 -0.025 0.980066
## KitchenQualFa -1.956e+04 5.622e+03 -3.479 0.000520 ***
## KitchenQualGd -1.531e+04 3.376e+03 -4.536 6.28e-06 ***
## KitchenQualTA -1.753e+04 3.701e+03 -4.735 2.43e-06 ***
## TotRmsAbvGrd3 -4.297e+04 1.278e+04 -3.363 0.000794 ***
## TotRmsAbvGrd4 -4.125e+04 1.058e+04 -3.898 0.000102 ***
## TotRmsAbvGrd5 -4.247e+04 1.010e+04 -4.206 2.78e-05 ***
## TotRmsAbvGrd6 -4.085e+04 9.849e+03 -4.148 3.58e-05 ***
## TotRmsAbvGrd7 -3.947e+04 9.581e+03 -4.120 4.04e-05 ***
## TotRmsAbvGrd8 -3.988e+04 9.349e+03 -4.266 2.14e-05 ***
## TotRmsAbvGrd9 -3.923e+04 9.447e+03 -4.152 3.51e-05 ***
## TotRmsAbvGrd10 -2.660e+04 9.128e+03 -2.914 0.003635 **
## TotRmsAbvGrd11 -3.874e+04 1.044e+04 -3.711 0.000215 ***
## TotRmsAbvGrd12 NA NA NA NA
## TotRmsAbvGrd14 NA NA NA NA
## FunctionalMaj2 -1.962e+03 1.251e+04 -0.157 0.875380
## FunctionalMin1 1.136e+04 7.946e+03 1.429 0.153240
## FunctionalMin2 7.826e+03 7.889e+03 0.992 0.321328
## FunctionalMod 1.869e+03 9.282e+03 0.201 0.840464
## FunctionalSev -2.791e+04 2.657e+04 -1.050 0.293695
## FunctionalTyp 1.806e+04 6.910e+03 2.614 0.009050 **
## Fireplaces1 2.417e+03 1.606e+03 1.505 0.132501
## Fireplaces2 9.627e+03 2.826e+03 3.406 0.000678 ***
## Fireplaces3 8.913e+03 1.149e+04 0.776 0.437866
## SaleConditionAdjLand 3.033e+04 1.385e+04 2.189 0.028754 *
## SaleConditionAlloca -1.138e+03 8.136e+03 -0.140 0.888791
## SaleConditionFamily 3.604e+03 5.671e+03 0.636 0.525215
## SaleConditionNormal 8.571e+03 2.535e+03 3.381 0.000744 ***
## SaleConditionPartial 2.639e+04 3.526e+03 7.484 1.34e-13 ***
## BsmtQualFa -1.656e+04 5.671e+03 -2.920 0.003562 **
## BsmtQualGd -1.484e+04 3.149e+03 -4.713 2.70e-06 ***
## BsmtQualTA -1.718e+04 3.782e+03 -4.543 6.08e-06 ***
## BsmtQualZ 9.376e+02 2.367e+04 0.040 0.968405
## BsmtExposureGd 1.200e+04 2.838e+03 4.228 2.53e-05 ***
## BsmtExposureMn -2.297e+03 2.730e+03 -0.841 0.400278
## BsmtExposureNo -4.206e+03 1.940e+03 -2.168 0.030322 *
## BsmtExposureZ -8.382e+03 2.198e+04 -0.381 0.702991
## BsmtFinType1BLQ 9.884e+02 2.528e+03 0.391 0.695896
## BsmtFinType1GLQ 5.536e+03 2.324e+03 2.382 0.017375 *
## BsmtFinType1LwQ -3.254e+03 3.310e+03 -0.983 0.325688
## BsmtFinType1Rec -1.898e+03 2.665e+03 -0.712 0.476493
## BsmtFinType1Unf 1.655e+03 2.670e+03 0.620 0.535344
## BsmtFinType1Z NA NA NA NA
## GarageTypeAttchd 3.086e+04 1.038e+04 2.973 0.003002 **
## GarageTypeBasment 3.251e+04 1.201e+04 2.707 0.006870 **
## GarageTypeBuiltIn 3.214e+04 1.075e+04 2.990 0.002848 **
## GarageTypeCarPort 2.718e+04 1.331e+04 2.041 0.041417 *
## GarageTypeDetchd 2.932e+04 1.034e+04 2.834 0.004673 **
## GarageTypeZ 3.694e+04 1.124e+04 3.285 0.001047 **
## PoolQCFa -1.352e+05 2.597e+04 -5.205 2.26e-07 ***
## PoolQCGd -8.826e+04 3.017e+04 -2.925 0.003502 **
## PoolQCZ 2.902e+05 8.684e+04 3.342 0.000856 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21710 on 1279 degrees of freedom
## Multiple R-squared: 0.9345, Adjusted R-squared: 0.9253
## F-statistic: 101.4 on 180 and 1279 DF, p-value: < 2.2e-16
qqnorm(final.model$residuals)
qqline(final.model$residuals)
I would have liked to use this model, but due to factor differences between the test and train that I couldn’t smooth out, I was forced to focus on just the numerical data instead.
alt.mod <- lm(SalePrice~LotFrontage + LotArea + MasVnrArea + BsmtFinSF1 +
BsmtFinSF2 + TotalBsmtSF + X1stFlrSF + X2ndFlrSF +
LowQualFinSF + GrLivArea + GarageCars + GarageArea +
WoodDeckSF + OpenPorchSF + EnclosedPorch + X3SsnPorch +
ScreenPorch + PoolArea + MiscVal, data = house.train)
alt.fin <- step(alt.mod, direction = "backward", trace = 0)
summary(alt.fin)
##
## Call:
## lm(formula = SalePrice ~ MasVnrArea + BsmtFinSF1 + TotalBsmtSF +
## X1stFlrSF + X2ndFlrSF + GarageCars + WoodDeckSF + OpenPorchSF +
## EnclosedPorch + ScreenPorch + PoolArea, data = house.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -566694 -18955 -777 17424 299450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17870.659 4277.394 -4.178 3.12e-05 ***
## MasVnrArea 41.897 7.131 5.875 5.22e-09 ***
## BsmtFinSF1 15.463 2.949 5.244 1.81e-07 ***
## TotalBsmtSF 38.763 4.821 8.041 1.84e-15 ***
## X1stFlrSF 57.281 5.288 10.833 < 2e-16 ***
## X2ndFlrSF 62.770 3.017 20.805 < 2e-16 ***
## GarageCars 29113.767 1837.438 15.845 < 2e-16 ***
## WoodDeckSF 50.445 9.589 5.261 1.65e-07 ***
## OpenPorchSF 52.489 18.316 2.866 0.00422 **
## EnclosedPorch -41.824 19.092 -2.191 0.02864 *
## ScreenPorch 43.765 20.543 2.130 0.03330 *
## PoolArea -53.431 28.810 -1.855 0.06386 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42960 on 1448 degrees of freedom
## Multiple R-squared: 0.7098, Adjusted R-squared: 0.7076
## F-statistic: 322 on 11 and 1448 DF, p-value: < 2.2e-16
qqnorm(alt.fin$residuals)
qqline(alt.fin$residuals)
house.test <- read.csv("test.csv")
house.test$MSSubClass <- as.factor(house.test$MSSubClass)
house.test$OverallQual <- as.factor(house.test$OverallQual)
house.test$OverallCond <- as.factor(house.test$OverallCond)
house.test$YearBuilt <- as.factor(house.test$YearBuilt)
house.test$YearRemodAdd <- as.factor(house.test$YearRemodAdd)
house.test$BsmtFullBath <- as.factor(house.test$BsmtFullBath)
house.test$BsmtHalfBath <- as.factor(house.test$BsmtHalfBath)
house.test$FullBath <- as.factor(house.test$FullBath)
house.test$HalfBath <- as.factor(house.test$HalfBath)
house.test$BedroomAbvGr <- as.factor(house.test$BedroomAbvGr)
house.test$KitchenAbvGr <- as.factor(house.test$KitchenAbvGr)
house.test$TotRmsAbvGrd <- as.factor(house.test$TotRmsAbvGrd)
house.test$Fireplaces <- as.factor(house.test$Fireplaces)
house.test$MoSold <- as.factor(house.test$MoSold)
house.test$YrSold <- as.factor(house.test$YrSold)
house.test$GarageYrBlt <- as.factor(house.test$GarageYrBlt)
house.test <- clean(house.test)
eval <- predict(alt.fin, newdata = house.test)
eval[eval < 0] <- 0
result <- data.frame('Id' = house.test$Id, 'SalePrice' = eval)
head(result)
## Id SalePrice
## 1 1461 116307.2
## 2 1462 179398.0
## 3 1463 198198.2
## 4 1464 202048.7
## 5 1465 177967.3
## 6 1466 181959.0
# write.csv(result, file = "result.csv", row.names = FALSE)
Kaggle name: Iden Watanabe Score of Model: 0.22668