library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.84 loaded
library(Matrix)
library(ggplot2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following:
# Read the training and test CSV files from their raw GitHub URLs.
# Character columns are kept as strings rather than converted to factors.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
linktrain <- "https://raw.githubusercontent.com/Fyoun123/602Final/master/train.csv"
train <- read.csv(linktrain, header = TRUE, stringsAsFactors = FALSE)
linktest <- "https://raw.githubusercontent.com/sortega7878/DATA605CSV/master/test.csv"
test <- read.csv(linktest, header = TRUE, stringsAsFactors = FALSE)
Data test
# Preview the first six rows of the training data.
utils::head(train, n = 6L)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 50 RL 85 14115 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl CollgCr Norm
## 2 Lvl AllPub FR2 Gtl Veenker Feedr
## 3 Lvl AllPub Inside Gtl CollgCr Norm
## 4 Lvl AllPub Corner Gtl Crawfor Norm
## 5 Lvl AllPub FR2 Gtl NoRidge Norm
## 6 Lvl AllPub Inside Gtl Mitchel Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 2Story 7 5 2003
## 2 Norm 1Fam 1Story 6 8 1976
## 3 Norm 1Fam 2Story 7 5 2001
## 4 Norm 1Fam 2Story 7 5 1915
## 5 Norm 1Fam 2Story 8 5 2000
## 6 Norm 1Fam 1.5Fin 5 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 2003 Gable CompShg VinylSd VinylSd BrkFace
## 2 1976 Gable CompShg MetalSd MetalSd None
## 3 2002 Gable CompShg VinylSd VinylSd BrkFace
## 4 1970 Gable CompShg Wd Sdng Wd Shng None
## 5 2000 Gable CompShg VinylSd VinylSd BrkFace
## 6 1995 Gable CompShg VinylSd VinylSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 196 Gd TA PConc Gd TA No
## 2 0 TA TA CBlock Gd TA Gd
## 3 162 Gd TA PConc Gd TA Mn
## 4 0 TA TA BrkTil TA Gd No
## 5 350 Gd TA PConc Gd TA Av
## 6 0 TA TA Wood Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 GLQ 706 Unf 0 150 856
## 2 ALQ 978 Unf 0 284 1262
## 3 GLQ 486 Unf 0 434 920
## 4 ALQ 216 Unf 0 540 756
## 5 GLQ 655 Unf 0 490 1145
## 6 GLQ 732 Unf 0 64 796
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA Ex Y SBrkr 856 854 0
## 2 GasA Ex Y SBrkr 1262 0 0
## 3 GasA Ex Y SBrkr 920 866 0
## 4 GasA Gd Y SBrkr 961 756 0
## 5 GasA Ex Y SBrkr 1145 1053 0
## 6 GasA Ex Y SBrkr 796 566 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1710 1 0 2 1 3
## 2 1262 0 1 2 0 3
## 3 1786 1 0 2 1 3
## 4 1717 1 0 1 0 3
## 5 2198 1 0 2 1 4
## 6 1362 1 0 1 1 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 <NA>
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 2003 RFn 2 548 TA
## 2 Attchd 1976 RFn 2 460 TA
## 3 Attchd 2001 RFn 2 608 TA
## 4 Detchd 1998 Unf 3 642 TA
## 5 Attchd 2000 RFn 3 836 TA
## 6 Attchd 1993 Unf 2 480 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA Y 0 61 0 0
## 2 TA Y 298 0 0 0
## 3 TA Y 0 42 0 0
## 4 TA Y 0 35 272 0
## 5 TA Y 192 84 0 0
## 6 TA Y 40 30 0 320
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 2 2008
## 2 0 0 <NA> <NA> <NA> 0 5 2007
## 3 0 0 <NA> <NA> <NA> 0 9 2008
## 4 0 0 <NA> <NA> <NA> 0 2 2006
## 5 0 0 <NA> <NA> <NA> 0 12 2008
## 6 0 0 <NA> MnPrv Shed 700 10 2009
## SaleType SaleCondition SalePrice
## 1 WD Normal 208500
## 2 WD Normal 181500
## 3 WD Normal 223500
## 4 WD Abnorml 140000
## 5 WD Normal 250000
## 6 WD Normal 143000
# Preview the first six rows of the test data.
utils::head(test, n = 6L)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1461 20 RH 80 11622 Pave <NA> Reg
## 2 1462 20 RL 81 14267 Pave <NA> IR1
## 3 1463 60 RL 74 13830 Pave <NA> IR1
## 4 1464 60 RL 78 9978 Pave <NA> IR1
## 5 1465 120 RL 43 5005 Pave <NA> IR1
## 6 1466 60 RL 75 10000 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl NAmes Feedr
## 2 Lvl AllPub Corner Gtl NAmes Norm
## 3 Lvl AllPub Inside Gtl Gilbert Norm
## 4 Lvl AllPub Inside Gtl Gilbert Norm
## 5 HLS AllPub Inside Gtl StoneBr Norm
## 6 Lvl AllPub Corner Gtl Gilbert Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 1Story 5 6 1961
## 2 Norm 1Fam 1Story 6 6 1958
## 3 Norm 1Fam 2Story 5 5 1997
## 4 Norm 1Fam 2Story 6 6 1998
## 5 Norm TwnhsE 1Story 8 5 1992
## 6 Norm 1Fam 2Story 6 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 1961 Gable CompShg VinylSd VinylSd None
## 2 1958 Hip CompShg Wd Sdng Wd Sdng BrkFace
## 3 1998 Gable CompShg VinylSd VinylSd None
## 4 1998 Gable CompShg VinylSd VinylSd BrkFace
## 5 1992 Gable CompShg HdBoard HdBoard None
## 6 1994 Gable CompShg HdBoard HdBoard None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 0 TA TA CBlock TA TA No
## 2 108 TA TA CBlock TA TA No
## 3 0 TA TA PConc Gd TA No
## 4 20 TA TA PConc TA TA No
## 5 0 Gd TA PConc Gd TA No
## 6 0 TA TA PConc Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 Rec 468 LwQ 144 270 882
## 2 ALQ 923 Unf 0 406 1329
## 3 GLQ 791 Unf 0 137 928
## 4 GLQ 602 Unf 0 324 926
## 5 ALQ 263 Unf 0 1017 1280
## 6 Unf 0 Unf 0 763 763
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA TA Y SBrkr 896 0 0
## 2 GasA TA Y SBrkr 1329 0 0
## 3 GasA Gd Y SBrkr 928 701 0
## 4 GasA Ex Y SBrkr 926 678 0
## 5 GasA Ex Y SBrkr 1280 0 0
## 6 GasA Gd Y SBrkr 763 892 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 896 0 0 1 0 2
## 2 1329 0 0 1 1 3
## 3 1629 0 0 2 1 3
## 4 1604 0 0 2 1 3
## 5 1280 0 0 2 0 2
## 6 1655 0 0 2 1 3
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 TA 5 Typ 0 <NA>
## 2 1 Gd 6 Typ 0 <NA>
## 3 1 TA 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 5 Typ 0 <NA>
## 6 1 TA 7 Typ 1 TA
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 1961 Unf 1 730 TA
## 2 Attchd 1958 Unf 1 312 TA
## 3 Attchd 1997 Fin 2 482 TA
## 4 Attchd 1998 Fin 2 470 TA
## 5 Attchd 1992 RFn 2 506 TA
## 6 Attchd 1993 Fin 2 440 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA Y 140 0 0 0
## 2 TA Y 393 36 0 0
## 3 TA Y 212 34 0 0
## 4 TA Y 360 36 0 0
## 5 TA Y 0 82 0 0
## 6 TA Y 157 84 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 120 0 <NA> MnPrv <NA> 0 6 2010
## 2 0 0 <NA> <NA> Gar2 12500 6 2010
## 3 0 0 <NA> MnPrv <NA> 0 3 2010
## 4 0 0 <NA> <NA> <NA> 0 6 2010
## 5 144 0 <NA> <NA> <NA> 0 1 2010
## 6 0 0 <NA> <NA> <NA> 0 4 2010
## SaleType SaleCondition
## 1 WD Normal
## 2 WD Normal
## 3 WD Normal
## 4 WD Normal
## 5 WD Normal
## 6 WD Normal
Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
# Keep only the numeric columns of the training data, then print
# univariate summary statistics for each of them.
is_numeric_col <- vapply(train, is.numeric, logical(1))
train_n <- train[, is_numeric_col, drop = FALSE]
psych::describe(train_n)
## vars n mean sd median trimmed mad
## Id 1 1460 730.50 421.61 730.5 730.50 541.15
## MSSubClass 2 1460 56.90 42.30 50.0 49.15 44.48
## LotFrontage 3 1201 70.05 24.28 69.0 68.94 16.31
## LotArea 4 1460 10516.83 9981.26 9478.5 9563.28 2962.23
## OverallQual 5 1460 6.10 1.38 6.0 6.08 1.48
## OverallCond 6 1460 5.58 1.11 5.0 5.48 0.00
## YearBuilt 7 1460 1971.27 30.20 1973.0 1974.13 37.06
## YearRemodAdd 8 1460 1984.87 20.65 1994.0 1986.37 19.27
## MasVnrArea 9 1452 103.69 181.07 0.0 63.15 0.00
## BsmtFinSF1 10 1460 443.64 456.10 383.5 386.08 568.58
## BsmtFinSF2 11 1460 46.55 161.32 0.0 1.38 0.00
## BsmtUnfSF 12 1460 567.24 441.87 477.5 519.29 426.99
## TotalBsmtSF 13 1460 1057.43 438.71 991.5 1036.70 347.67
## X1stFlrSF 14 1460 1162.63 386.59 1087.0 1129.99 347.67
## X2ndFlrSF 15 1460 346.99 436.53 0.0 285.36 0.00
## LowQualFinSF 16 1460 5.84 48.62 0.0 0.00 0.00
## GrLivArea 17 1460 1515.46 525.48 1464.0 1467.67 483.33
## BsmtFullBath 18 1460 0.43 0.52 0.0 0.39 0.00
## BsmtHalfBath 19 1460 0.06 0.24 0.0 0.00 0.00
## FullBath 20 1460 1.57 0.55 2.0 1.56 0.00
## HalfBath 21 1460 0.38 0.50 0.0 0.34 0.00
## BedroomAbvGr 22 1460 2.87 0.82 3.0 2.85 0.00
## KitchenAbvGr 23 1460 1.05 0.22 1.0 1.00 0.00
## TotRmsAbvGrd 24 1460 6.52 1.63 6.0 6.41 1.48
## Fireplaces 25 1460 0.61 0.64 1.0 0.53 1.48
## GarageYrBlt 26 1379 1978.51 24.69 1980.0 1981.07 31.13
## GarageCars 27 1460 1.77 0.75 2.0 1.77 0.00
## GarageArea 28 1460 472.98 213.80 480.0 469.81 177.91
## WoodDeckSF 29 1460 94.24 125.34 0.0 71.76 0.00
## OpenPorchSF 30 1460 46.66 66.26 25.0 33.23 37.06
## EnclosedPorch 31 1460 21.95 61.12 0.0 3.87 0.00
## X3SsnPorch 32 1460 3.41 29.32 0.0 0.00 0.00
## ScreenPorch 33 1460 15.06 55.76 0.0 0.00 0.00
## PoolArea 34 1460 2.76 40.18 0.0 0.00 0.00
## MiscVal 35 1460 43.49 496.12 0.0 0.00 0.00
## MoSold 36 1460 6.32 2.70 6.0 6.25 2.97
## YrSold 37 1460 2007.82 1.33 2008.0 2007.77 1.48
## SalePrice 38 1460 180921.20 79442.50 163000.0 170783.29 56338.80
## min max range skew kurtosis se
## Id 1 1460 1459 0.00 -1.20 11.03
## MSSubClass 20 190 170 1.40 1.56 1.11
## LotFrontage 21 313 292 2.16 17.34 0.70
## LotArea 1300 215245 213945 12.18 202.26 261.22
## OverallQual 1 10 9 0.22 0.09 0.04
## OverallCond 1 9 8 0.69 1.09 0.03
## YearBuilt 1872 2010 138 -0.61 -0.45 0.79
## YearRemodAdd 1950 2010 60 -0.50 -1.27 0.54
## MasVnrArea 0 1600 1600 2.66 10.03 4.75
## BsmtFinSF1 0 5644 5644 1.68 11.06 11.94
## BsmtFinSF2 0 1474 1474 4.25 20.01 4.22
## BsmtUnfSF 0 2336 2336 0.92 0.46 11.56
## TotalBsmtSF 0 6110 6110 1.52 13.18 11.48
## X1stFlrSF 334 4692 4358 1.37 5.71 10.12
## X2ndFlrSF 0 2065 2065 0.81 -0.56 11.42
## LowQualFinSF 0 572 572 8.99 82.83 1.27
## GrLivArea 334 5642 5308 1.36 4.86 13.75
## BsmtFullBath 0 3 3 0.59 -0.84 0.01
## BsmtHalfBath 0 2 2 4.09 16.31 0.01
## FullBath 0 3 3 0.04 -0.86 0.01
## HalfBath 0 2 2 0.67 -1.08 0.01
## BedroomAbvGr 0 8 8 0.21 2.21 0.02
## KitchenAbvGr 0 3 3 4.48 21.42 0.01
## TotRmsAbvGrd 2 14 12 0.67 0.87 0.04
## Fireplaces 0 3 3 0.65 -0.22 0.02
## GarageYrBlt 1900 2010 110 -0.65 -0.42 0.66
## GarageCars 0 4 4 -0.34 0.21 0.02
## GarageArea 0 1418 1418 0.18 0.90 5.60
## WoodDeckSF 0 857 857 1.54 2.97 3.28
## OpenPorchSF 0 547 547 2.36 8.44 1.73
## EnclosedPorch 0 552 552 3.08 10.37 1.60
## X3SsnPorch 0 508 508 10.28 123.06 0.77
## ScreenPorch 0 480 480 4.11 18.34 1.46
## PoolArea 0 738 738 14.80 222.19 1.05
## MiscVal 0 15500 15500 24.43 697.64 12.98
## MoSold 1 12 11 0.21 -0.41 0.07
## YrSold 2006 2010 4 0.10 -1.19 0.03
## SalePrice 34900 755000 720100 1.88 6.50 2079.11
# X: above-grade living area column, with its summary statistics.
X <- train[["GrLivArea"]]
psych::describe(X)
## vars n mean sd median trimmed mad min max range skew
## X1 1 1460 1515.46 525.48 1464 1467.67 483.33 334 5642 5308 1.36
## kurtosis se
## X1 4.86 13.75
# Y: sale price column (the dependent variable), with its summary statistics.
Y <- train[["SalePrice"]]
psych::describe(Y)
## vars n mean sd median trimmed mad min max range
## X1 1 1460 180921.2 79442.5 163000 170783.3 56338.8 34900 755000 720100
## skew kurtosis se
## X1 1.88 6.5 2079.11
# Z: total rooms above grade column, with its summary statistics.
Z <- train[["TotRmsAbvGrd"]]
psych::describe(Z)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 1460 6.52 1.63 6 6.41 1.48 2 14 12 0.67 0.87
## se
## X1 0.04
# Sale price by total rooms above grade (discrete predictor, one box each).
ggplot(train, aes(x = factor(TotRmsAbvGrd), y = SalePrice)) +
  geom_boxplot() +
  labs(x = "Rooms AG")

# Sale price by living area. GrLivArea is continuous, so it is binned into
# bands of width 500 instead of drawing one box per distinct value.
ggplot(
  train,
  aes(x = cut_width(GrLivArea, width = 500, boundary = 0), y = SalePrice)
) +
  geom_boxplot() +
  labs(x = "Living Area") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Living area by total rooms above grade.
ggplot(train, aes(x = factor(TotRmsAbvGrd), y = GrLivArea)) +
  geom_boxplot() +
  labs(x = "Rooms AG")

# Scatter plots against sale price. Continuous predictors stay numeric on the
# x axis, and `colour` (not `fill`) is what tints the default point shape.
ggplot(train, aes(x = GrLivArea, y = SalePrice)) +
  geom_point(colour = "blue") +
  labs(x = "Living Area")

ggplot(train, aes(x = factor(TotRmsAbvGrd), y = SalePrice)) +
  geom_point(colour = "blue") +
  labs(x = "Rooms AG")

ggplot(train, aes(x = LotArea, y = SalePrice)) +
  geom_point(colour = "blue") +
  labs(x = "Lot Area")

# Scatterplot matrix of three predictors and the dependent variable.
pairs(
  ~ LotArea + GrLivArea + SalePrice + TotRmsAbvGrd,
  data = train,
  main = "Scatter Plot"
)

# Correlation matrix of the same four variables, drawn with corrplot.
# dplyr::select is namespaced because MASS masks select().
train %>%
  dplyr::select(GrLivArea, TotRmsAbvGrd, LotArea, SalePrice) %>%
  cor(use = "pairwise.complete.obs") %>%
  corrplot()
There is a correlation between the number of rooms above grade and sale price, which is to be expected. There is a similar correlation between living area and sale price. The correlation between lot area and sale price is not as strong.
H0: ρ = 0 (no association)
H1: ρ ≠ 0 (a correlation may exist)
# Pearson correlation test of TotRmsAbvGrd against SalePrice, reporting an 80% confidence interval.
cor.test(train$TotRmsAbvGrd, train$SalePrice, method = 'pearson', conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train$TotRmsAbvGrd and train$SalePrice
## t = 24.099, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5092841 0.5573021
## sample estimates:
## cor
## 0.5337232
The p-value is very small (< 2.2e-16). 80% CI = (0.509, 0.557); correlation = 0.534.
# Pearson correlation test of GrLivArea against SalePrice, reporting an 80% confidence interval.
cor.test(train$GrLivArea, train$SalePrice, method = 'pearson', conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train$GrLivArea and train$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6915087 0.7249450
## sample estimates:
## cor
## 0.7086245
The p-value is very small (< 2.2e-16). 80% CI = (0.692, 0.725); correlation = 0.709.
# Pearson correlation test of LotArea against SalePrice, reporting an 80% confidence interval.
cor.test(train$LotArea, train$SalePrice, method = 'pearson', conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train$LotArea and train$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2323391 0.2947946
## sample estimates:
## cor
## 0.2638434
The p-value is very small (< 2.2e-16). 80% CI = (0.232, 0.295); correlation = 0.264.
In all three tests the p-value is nearly 0, so we can reject the null hypothesis: each variable is significantly correlated with sale price.
Familywise error risk is low. In statistics, the family-wise error rate is the probability of making one or more false discoveries (type I errors) when performing multiple hypothesis tests. In this case we ran three correlation tests, and every p-value was extremely low (< 2.2e-16), so each result would remain significant even after a Bonferroni correction for the three tests.
5 points. Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
# Correlation matrix of the three chosen variables (columns stay in the order
# they appear in `train`), followed by its inverse; the outer parentheses
# print the inverse as it is assigned.
corr_vars <- c("GrLivArea", "SalePrice", "TotRmsAbvGrd")
Lacorr <- cor(train[, names(train) %in% corr_vars])
(LAmatrix_inv <- solve(Lacorr))
## GrLivArea TotRmsAbvGrd SalePrice
## GrLivArea 4.585000 -2.8676627 -1.7185052
## TotRmsAbvGrd -2.867663 3.1918921 0.3285093
## SalePrice -1.718505 0.3285093 2.0424418
# Correlation matrix times its inverse; the result should be the identity up to floating-point error.
Lacorr %*% LAmatrix_inv#multiply
## GrLivArea TotRmsAbvGrd SalePrice
## GrLivArea 1.000000e+00 -1.665335e-16 0
## TotRmsAbvGrd -4.440892e-16 1.000000e+00 0
## SalePrice -4.440892e-16 -2.220446e-16 1
# Inverse times the correlation matrix (the reverse order); again the identity up to floating-point error.
LAmatrix_inv %*% Lacorr
## GrLivArea TotRmsAbvGrd SalePrice
## GrLivArea 1.000000e+00 -4.440892e-16 -4.440892e-16
## TotRmsAbvGrd -1.665335e-16 1.000000e+00 -2.220446e-16
## SalePrice 0.000000e+00 0.000000e+00 1.000000e+00
# LU-factorise the inverted correlation matrix, expand the factorisation into
# its component matrices, and print the lower-triangular factor.
lu_factorisation <- lu(LAmatrix_inv)
lu_matrix <- expand(lu_factorisation)
lu_matrix[["L"]]
## 3 x 3 Matrix of class "dtrMatrix" (unitriangular)
## [,1] [,2] [,3]
## [1,] 1.0000000 . .
## [2,] -0.6254444 1.0000000 .
## [3,] -0.3748103 -0.5337232 1.0000000
# Print the upper-triangular factor of the decomposition.
lu_matrix[["U"]]
## 3 x 3 Matrix of class "dtrMatrix"
## [,1] [,2] [,3]
## [1,] 4.5850000 -2.8676627 -1.7185052
## [2,] . 1.3983284 -0.7463202
## [3,] . . 1.0000000
# Multiply the two triangular factors back together as a check on the decomposition.
# NOTE(review): the expanded LU presumably also carries a permutation component (lu_matrix$P) that is not
# applied here; L %*% U alone reproduces the input only if no row pivoting occurred — confirm.
lu_matrix$L %*% lu_matrix$U
## 3 x 3 Matrix of class "dgeMatrix"
## [,1] [,2] [,3]
## [1,] 4.585000 -2.8676627 -1.7185052
## [2,] -2.867663 3.1918921 0.3285093
## [3,] -1.718505 0.3285093 2.0424418
5 points.
Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
# Variable used for the distribution fit: the LotArea column of the training data.
Calc <- train[["LotArea"]]
summary(Calc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1300 7554 9478 10517 11602 215245
The minimum value is 1300, which is already above zero, so no shift is needed.
# Histogram of the raw variable, then fit an exponential distribution to it.
hist(Calc)
exponential <- MASS::fitdistr(Calc, densfun = "exponential")
# L holds the fitted rate parameter (lambda).
L <- exponential[["estimate"]]
# Reciprocal of the fitted rate; it still carries the element name "rate"
# when printed.
Exp_l <- 1 / L
print(Exp_l)
## rate
## 10516.83
# Fix the RNG seed so the simulated sample (and its histogram) is reproducible
# from run to run.
set.seed(605)
exponential_sample <- rexp(1000, rate = L) # taking 1000 samples
# Density-scaled histograms of the observed data and of the simulated sample,
# drawn with the same number of breaks so their shapes can be compared.
hist(Calc, freq = FALSE, breaks = 30)
hist(exponential_sample, freq = FALSE, breaks = 30)
# 5th and 95th percentiles of the fitted exponential distribution
qexp(c(0.05, 0.95), rate = L)
## [1] 539.4428 31505.6013
The 0.025 and 0.975 quantiles are used because the 5% is split across the two tails (0.05 / 2 in each).
# 2.5% and 97.5% quantiles of a normal distribution whose mean and standard
# deviation are taken from the observed data.
calc_mean <- mean(Calc)
calc_sd <- sd(Calc)
qnorm(c(0.025, 0.975), mean = calc_mean, sd = calc_sd)
## [1] -9046.092 30079.748
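Under a normal distribution with the sample mean and standard deviation of LotArea, 95% of values would fall between -9046.1 and 30079.7. Note that this is a range for individual observations, not a confidence interval for the mean (which would use the standard error, sd / sqrt(n)); the negative lower bound also shows that the normal assumption fits this right-skewed variable poorly.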
# Empirical 5th and 95th percentiles of the original (observed) data
quantile(Calc, probs = c(0.05, 0.95))
## 5% 95%
## 3311.70 17401.15
5% will be less than 3311.70 and 5% will be greater than 17401.15
Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
Only the numeric columns are used in the model below; non-numeric columns are ignored, and rows with missing values are dropped when the model is fitted.
# Fit SalePrice on every other column of train_n with glm's default family, then print the fit summary.
# NOTE(review): train_n still contains the Id column, so the row identifier enters the model as a
# predictor — consider excluding it.
Model <- glm(SalePrice ~ ., data=train_n)
summary(Model)
##
## Call:
## glm(formula = SalePrice ~ ., data = train_n)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -442182 -16955 -2824 15125 318183
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.351e+05 1.701e+06 -0.197 0.843909
## Id -1.205e+00 2.658e+00 -0.453 0.650332
## MSSubClass -2.001e+02 3.451e+01 -5.797 8.84e-09 ***
## LotFrontage -1.160e+02 6.126e+01 -1.894 0.058503 .
## LotArea 5.422e-01 1.575e-01 3.442 0.000599 ***
## OverallQual 1.866e+04 1.482e+03 12.592 < 2e-16 ***
## OverallCond 5.239e+03 1.368e+03 3.830 0.000135 ***
## YearBuilt 3.164e+02 8.766e+01 3.610 0.000321 ***
## YearRemodAdd 1.194e+02 8.668e+01 1.378 0.168607
## MasVnrArea 3.141e+01 7.022e+00 4.473 8.54e-06 ***
## BsmtFinSF1 1.736e+01 5.838e+00 2.973 0.003014 **
## BsmtFinSF2 8.342e+00 8.766e+00 0.952 0.341532
## BsmtUnfSF 5.005e+00 5.277e+00 0.948 0.343173
## TotalBsmtSF NA NA NA NA
## X1stFlrSF 4.597e+01 7.360e+00 6.246 6.02e-10 ***
## X2ndFlrSF 4.663e+01 6.102e+00 7.641 4.72e-14 ***
## LowQualFinSF 3.341e+01 2.794e+01 1.196 0.232009
## GrLivArea NA NA NA NA
## BsmtFullBath 9.043e+03 3.198e+03 2.828 0.004776 **
## BsmtHalfBath 2.465e+03 5.073e+03 0.486 0.627135
## FullBath 5.433e+03 3.531e+03 1.539 0.124182
## HalfBath -1.098e+03 3.321e+03 -0.331 0.740945
## BedroomAbvGr -1.022e+04 2.155e+03 -4.742 2.40e-06 ***
## KitchenAbvGr -2.202e+04 6.710e+03 -3.282 0.001063 **
## TotRmsAbvGrd 5.464e+03 1.487e+03 3.674 0.000251 ***
## Fireplaces 4.372e+03 2.189e+03 1.998 0.046020 *
## GarageYrBlt -4.728e+01 9.106e+01 -0.519 0.603742
## GarageCars 1.685e+04 3.491e+03 4.827 1.58e-06 ***
## GarageArea 6.274e+00 1.213e+01 0.517 0.605002
## WoodDeckSF 2.144e+01 1.002e+01 2.139 0.032662 *
## OpenPorchSF -2.252e+00 1.949e+01 -0.116 0.907998
## EnclosedPorch 7.295e+00 2.062e+01 0.354 0.723590
## X3SsnPorch 3.349e+01 3.758e+01 0.891 0.373163
## ScreenPorch 5.805e+01 2.041e+01 2.844 0.004532 **
## PoolArea -6.052e+01 2.990e+01 -2.024 0.043204 *
## MiscVal -3.761e+00 6.960e+00 -0.540 0.589016
## MoSold -2.217e+02 4.229e+02 -0.524 0.600188
## YrSold -2.474e+02 8.458e+02 -0.293 0.769917
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1354264248)
##
## Null deviance: 7.7155e+12 on 1120 degrees of freedom
## Residual deviance: 1.4694e+12 on 1085 degrees of freedom
## (339 observations deleted due to missingness)
## AIC: 26789
##
## Number of Fisher Scoring iterations: 2
Taking the highest ranked predictors and what I deem to be a big factor to create model below.
# Reduced model A: SalePrice on five hand-picked predictors from train_n, followed by its summary.
ModelA <- glm(SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond + YearBuilt, data=train_n)
summary(ModelA)
##
## Call:
## glm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual +
## OverallCond + YearBuilt, data = train_n)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -264011 -27126 -3936 19071 392919
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.757e+05 1.023e+05 -7.585 5.89e-14 ***
## MSSubClass -1.584e+02 2.840e+01 -5.576 2.93e-08 ***
## LotArea 1.402e+00 1.210e-01 11.583 < 2e-16 ***
## OverallQual 4.031e+04 1.069e+03 37.710 < 2e-16 ***
## OverallCond 2.343e+03 1.168e+03 2.006 0.045 *
## YearBuilt 3.510e+02 5.226e+01 6.717 2.65e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 2052712835)
##
## Null deviance: 9.2079e+12 on 1459 degrees of freedom
## Residual deviance: 2.9846e+12 on 1454 degrees of freedom
## AIC: 35457
##
## Number of Fisher Scoring iterations: 2
# Model B: SalePrice on ten predictors (quality, age, garage, area and room-count variables), followed by its summary.
ModelB<- glm(SalePrice~OverallQual + YearBuilt + GarageCars + GrLivArea + TotalBsmtSF + TotRmsAbvGrd + BedroomAbvGr + LotArea + X2ndFlrSF + YearRemodAdd, data = train_n)
summary(ModelB)
##
## Call:
## glm(formula = SalePrice ~ OverallQual + YearBuilt + GarageCars +
## GrLivArea + TotalBsmtSF + TotRmsAbvGrd + BedroomAbvGr + LotArea +
## X2ndFlrSF + YearRemodAdd, data = train_n)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -497751 -18752 -808 16057 293856
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.061e+06 1.202e+05 -8.828 < 2e-16 ***
## OverallQual 1.909e+04 1.185e+03 16.104 < 2e-16 ***
## YearBuilt 2.577e+02 4.723e+01 5.456 5.71e-08 ***
## GarageCars 1.248e+04 1.790e+03 6.969 4.83e-12 ***
## GrLivArea 5.486e+01 5.475e+00 10.020 < 2e-16 ***
## TotalBsmtSF 2.186e+01 4.151e+00 5.266 1.61e-07 ***
## TotRmsAbvGrd 3.185e+03 1.251e+03 2.547 0.011 *
## BedroomAbvGr -9.578e+03 1.734e+03 -5.522 3.96e-08 ***
## LotArea 6.557e-01 1.042e-01 6.293 4.12e-10 ***
## X2ndFlrSF -6.146e+00 4.809e+00 -1.278 0.201
## YearRemodAdd 2.476e+02 6.246e+01 3.965 7.71e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1383802299)
##
## Null deviance: 9.2079e+12 on 1459 degrees of freedom
## Residual deviance: 2.0051e+12 on 1449 degrees of freedom
## AIC: 34886
##
## Number of Fisher Scoring iterations: 2
# Model C: SalePrice on five predictors, all of which also appear in ModelB; followed by its summary.
ModelC <- glm(SalePrice ~ LotArea + OverallQual + YearBuilt + GrLivArea + TotRmsAbvGrd, data=train_n)
summary(ModelC)
##
## Call:
## glm(formula = SalePrice ~ LotArea + OverallQual + YearBuilt +
## GrLivArea + TotRmsAbvGrd, data = train_n)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -428338 -21164 -2365 17500 289820
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.052e+06 8.222e+04 -12.792 <2e-16 ***
## LotArea 9.096e-01 1.086e-01 8.373 <2e-16 ***
## OverallQual 2.559e+04 1.150e+03 22.255 <2e-16 ***
## YearBuilt 4.991e+02 4.313e+01 11.572 <2e-16 ***
## GrLivArea 5.985e+01 4.082e+00 14.663 <2e-16 ***
## TotRmsAbvGrd -1.142e+03 1.150e+03 -0.993 0.321
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1584283249)
##
## Null deviance: 9.2079e+12 on 1459 degrees of freedom
## Residual deviance: 2.3035e+12 on 1454 degrees of freedom
## AIC: 35079
##
## Number of Fisher Scoring iterations: 2
# Sequential analysis-of-deviance table for the three fits (no test statistic is requested).
# NOTE(review): ModelA includes MSSubClass and OverallCond, which ModelB lacks, so the models are not
# nested and the deviance differences are hard to interpret; comparing AIC may be more appropriate — confirm.
anova(ModelA,ModelB,ModelC)
## Analysis of Deviance Table
##
## Model 1: SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond +
## YearBuilt
## Model 2: SalePrice ~ OverallQual + YearBuilt + GarageCars + GrLivArea +
## TotalBsmtSF + TotRmsAbvGrd + BedroomAbvGr + LotArea + X2ndFlrSF +
## YearRemodAdd
## Model 3: SalePrice ~ LotArea + OverallQual + YearBuilt + GrLivArea + TotRmsAbvGrd
## Resid. Df Resid. Dev Df Deviance
## 1 1454 2.9846e+12
## 2 1449 2.0051e+12 5 9.7951e+11
## 3 1454 2.3035e+12 -5 -2.9842e+11
# Score the test set with ModelC and assemble the two-column (Id, SalePrice)
# table, then preview its first rows.
Predictor <- predict(ModelC, newdata = test, type = "response")
Predicted <- data.frame(Id = test$Id, SalePrice = Predictor)
head(Predicted)
## Id SalePrice
## 1 1461 113326.9
## 2 1462 164600.7
## 3 1463 176029.8
## 4 1464 195978.6
## 5 1465 222537.8
## 6 1466 196555.7
# Write the prediction table to ModelC.csv without a row-name column.
utils::write.csv(x = Predicted, file = "ModelC.csv", row.names = FALSE)
Model C:
Kaggle Username: