##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean of \(\mu=\sigma=(N+1)/2\)
# Draw Y: n normal variates whose mean and standard deviation are both
# (N + 1) / 2. N is not assigned here and must already exist.
n <- 10000 # number of trials
m <- (N + 1) / 2 # mean of Y
sd <- m # standard deviation of Y, equal to the mean
Y <- rnorm(n = n, mean = m, sd = sd)
hist(Y)
Calculate, as a minimum, probabilities a through c below. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.
x is median of X
y is 1st quartile of Y
## [1] 18.3506
## [1] 6.290243
\(P(X>x | X>y)=\frac{P(X>x, X>y)}{P(X>y)}\)
\(P(X>x, X>y)\) => the number of outcomes where X > x and X > y, divided by the total number of outcomes for X
\(P(X>y)\) => the number of outcomes where X > y, divided by the total number of outcomes for X
## [1] 0.6072383
This means the probability of X being greater than X’s median given X being greater than Y’s first quartile is 60%
\(P(X>x, Y>y)\) => Sum of all the possibilities where X>x and Y>y, divided by all the possibilities of X
## [1] 0.3767
This means the probability of X being greater than the median of X and Y being greater than the first quartile of Y is about 38% (0.3767).
\(P(X<x, X>y)\) => Sum of all the possibilities where X < x and X > y, divided by all the possibilities of X
## [1] 0.3234
This means the probability of X being smaller than the median of X and X being greater than the first quartile of Y is about 32% (0.3234).
Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.
# Build a 2x2 contingency table of X (split at x) against Y (split at y),
# together with its row and column totals, as relative frequencies.
# Strict inequalities are used throughout, so an observation exactly equal
# to x or y is not counted in any cell.
a <- sum(X > x & Y < y) # outcomes where X > x and Y < y
b <- sum(X > x & Y > y) # outcomes where X > x and Y > y
# The variable `c` shares its name with base::c(); the c(...) calls further
# down still resolve to the function because R skips non-function objects
# when looking up a name that is being called.
c <- sum(X < x & Y < y) # outcomes where X < x and Y < y
d <- sum(X < x & Y > y) # outcomes where X < x and Y > y

# Marginal counts.
total_1 <- a + c # Y < y
total_2 <- b + d # Y > y
total_3 <- a + b # X > x
total_4 <- c + d # X < x
total_5 <- total_3 + total_4 # every counted outcome

# Convert the counts to proportions of the n trials.
p_a <- a / n
p_b <- b / n
p_c <- c / n
p_d <- d / n
p_total_1 <- total_1 / n
p_total_2 <- total_2 / n
p_total_3 <- total_3 / n
p_total_4 <- total_4 / n
p_total_5 <- total_5 / n

# matrix() fills column by column, so each row of values below is one
# COLUMN of the final table: "X>x", then "X<x", then the row totals.
conditions_table <- matrix(
  c(
    p_a, p_b, p_total_3,
    p_c, p_d, p_total_4,
    p_total_1, p_total_2, p_total_5
  ),
  nrow = 3, ncol = 3
)
conditions_df <- as.data.frame(conditions_table)
names(conditions_df) <- c("X>x", "X<x", "Total")
row.names(conditions_df) <- c("Y<y", "Y>y", "Total")
conditions_df## X>x X<x Total
## Y<y 0.1233 0.1267 0.25
## Y>y 0.3767 0.3733 0.75
## Total 0.5000 0.5000 1.00
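Based on the table, \(P(X>x \text{ and } Y>y)=0.3767\) and \(P(X>x)P(Y>y)=0.5 \times 0.75=0.375\). The two values are nearly equal, which is consistent with X and Y being independent.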
Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?
##
## Fisher's Exact Test for Count Data
##
## data: matrix
## p-value = 0.446
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.879972 1.056878
## sample estimates:
## odds ratio
## 0.9643714
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: matrix
## X-squared = 0.5808, df = 1, p-value = 0.446
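Fisher’s Exact Test computes an exact p-value and is typically used when the sample size is small (for example, when expected cell counts are below 5). The Chi-Square Test relies on a large-sample approximation; with 10,000 observations and more than a thousand in every cell, it is the appropriate test in this case. Both tests give a p-value of 0.446, so we fail to reject the null hypothesis that X and Y are independent.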
Register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques
# Load the house-prices training and test CSV files from GitHub.
# stringsAsFactors = TRUE is passed explicitly because read.csv() stopped
# converting strings to factors by default in R 4.0.0, while the str() and
# summary() output of df_train in this report shows the categorical columns
# as factors.
df_train <- read.csv(
  "https://raw.githubusercontent.com/anilak1978/house-prices/master/train.csv",
  stringsAsFactors = TRUE
)
df_test <- read.csv(
  "https://raw.githubusercontent.com/anilak1978/house-prices/master/test.csv",
  stringsAsFactors = TRUE
)
head(df_train)## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 50 RL 85 14115 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl CollgCr Norm
## 2 Lvl AllPub FR2 Gtl Veenker Feedr
## 3 Lvl AllPub Inside Gtl CollgCr Norm
## 4 Lvl AllPub Corner Gtl Crawfor Norm
## 5 Lvl AllPub FR2 Gtl NoRidge Norm
## 6 Lvl AllPub Inside Gtl Mitchel Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 2Story 7 5 2003
## 2 Norm 1Fam 1Story 6 8 1976
## 3 Norm 1Fam 2Story 7 5 2001
## 4 Norm 1Fam 2Story 7 5 1915
## 5 Norm 1Fam 2Story 8 5 2000
## 6 Norm 1Fam 1.5Fin 5 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 2003 Gable CompShg VinylSd VinylSd BrkFace
## 2 1976 Gable CompShg MetalSd MetalSd None
## 3 2002 Gable CompShg VinylSd VinylSd BrkFace
## 4 1970 Gable CompShg Wd Sdng Wd Shng None
## 5 2000 Gable CompShg VinylSd VinylSd BrkFace
## 6 1995 Gable CompShg VinylSd VinylSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 196 Gd TA PConc Gd TA No
## 2 0 TA TA CBlock Gd TA Gd
## 3 162 Gd TA PConc Gd TA Mn
## 4 0 TA TA BrkTil TA Gd No
## 5 350 Gd TA PConc Gd TA Av
## 6 0 TA TA Wood Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 GLQ 706 Unf 0 150 856
## 2 ALQ 978 Unf 0 284 1262
## 3 GLQ 486 Unf 0 434 920
## 4 ALQ 216 Unf 0 540 756
## 5 GLQ 655 Unf 0 490 1145
## 6 GLQ 732 Unf 0 64 796
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA Ex Y SBrkr 856 854 0
## 2 GasA Ex Y SBrkr 1262 0 0
## 3 GasA Ex Y SBrkr 920 866 0
## 4 GasA Gd Y SBrkr 961 756 0
## 5 GasA Ex Y SBrkr 1145 1053 0
## 6 GasA Ex Y SBrkr 796 566 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1710 1 0 2 1 3
## 2 1262 0 1 2 0 3
## 3 1786 1 0 2 1 3
## 4 1717 1 0 1 0 3
## 5 2198 1 0 2 1 4
## 6 1362 1 0 1 1 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 <NA>
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 2003 RFn 2 548 TA
## 2 Attchd 1976 RFn 2 460 TA
## 3 Attchd 2001 RFn 2 608 TA
## 4 Detchd 1998 Unf 3 642 TA
## 5 Attchd 2000 RFn 3 836 TA
## 6 Attchd 1993 Unf 2 480 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA Y 0 61 0 0
## 2 TA Y 298 0 0 0
## 3 TA Y 0 42 0 0
## 4 TA Y 0 35 272 0
## 5 TA Y 192 84 0 0
## 6 TA Y 40 30 0 320
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 2 2008
## 2 0 0 <NA> <NA> <NA> 0 5 2007
## 3 0 0 <NA> <NA> <NA> 0 9 2008
## 4 0 0 <NA> <NA> <NA> 0 2 2006
## 5 0 0 <NA> <NA> <NA> 0 12 2008
## 6 0 0 <NA> MnPrv Shed 700 10 2009
## SaleType SaleCondition SalePrice
## 1 WD Normal 208500
## 2 WD Normal 181500
## 3 WD Normal 223500
## 4 WD Abnorml 140000
## 5 WD Normal 250000
## 6 WD Normal 143000
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1461 20 RH 80 11622 Pave <NA> Reg
## 2 1462 20 RL 81 14267 Pave <NA> IR1
## 3 1463 60 RL 74 13830 Pave <NA> IR1
## 4 1464 60 RL 78 9978 Pave <NA> IR1
## 5 1465 120 RL 43 5005 Pave <NA> IR1
## 6 1466 60 RL 75 10000 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl NAmes Feedr
## 2 Lvl AllPub Corner Gtl NAmes Norm
## 3 Lvl AllPub Inside Gtl Gilbert Norm
## 4 Lvl AllPub Inside Gtl Gilbert Norm
## 5 HLS AllPub Inside Gtl StoneBr Norm
## 6 Lvl AllPub Corner Gtl Gilbert Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 1Story 5 6 1961
## 2 Norm 1Fam 1Story 6 6 1958
## 3 Norm 1Fam 2Story 5 5 1997
## 4 Norm 1Fam 2Story 6 6 1998
## 5 Norm TwnhsE 1Story 8 5 1992
## 6 Norm 1Fam 2Story 6 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 1961 Gable CompShg VinylSd VinylSd None
## 2 1958 Hip CompShg Wd Sdng Wd Sdng BrkFace
## 3 1998 Gable CompShg VinylSd VinylSd None
## 4 1998 Gable CompShg VinylSd VinylSd BrkFace
## 5 1992 Gable CompShg HdBoard HdBoard None
## 6 1994 Gable CompShg HdBoard HdBoard None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 0 TA TA CBlock TA TA No
## 2 108 TA TA CBlock TA TA No
## 3 0 TA TA PConc Gd TA No
## 4 20 TA TA PConc TA TA No
## 5 0 Gd TA PConc Gd TA No
## 6 0 TA TA PConc Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 Rec 468 LwQ 144 270 882
## 2 ALQ 923 Unf 0 406 1329
## 3 GLQ 791 Unf 0 137 928
## 4 GLQ 602 Unf 0 324 926
## 5 ALQ 263 Unf 0 1017 1280
## 6 Unf 0 Unf 0 763 763
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA TA Y SBrkr 896 0 0
## 2 GasA TA Y SBrkr 1329 0 0
## 3 GasA Gd Y SBrkr 928 701 0
## 4 GasA Ex Y SBrkr 926 678 0
## 5 GasA Ex Y SBrkr 1280 0 0
## 6 GasA Gd Y SBrkr 763 892 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 896 0 0 1 0 2
## 2 1329 0 0 1 1 3
## 3 1629 0 0 2 1 3
## 4 1604 0 0 2 1 3
## 5 1280 0 0 2 0 2
## 6 1655 0 0 2 1 3
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 TA 5 Typ 0 <NA>
## 2 1 Gd 6 Typ 0 <NA>
## 3 1 TA 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 5 Typ 0 <NA>
## 6 1 TA 7 Typ 1 TA
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 1961 Unf 1 730 TA
## 2 Attchd 1958 Unf 1 312 TA
## 3 Attchd 1997 Fin 2 482 TA
## 4 Attchd 1998 Fin 2 470 TA
## 5 Attchd 1992 RFn 2 506 TA
## 6 Attchd 1993 Fin 2 440 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA Y 140 0 0 0
## 2 TA Y 393 36 0 0
## 3 TA Y 212 34 0 0
## 4 TA Y 360 36 0 0
## 5 TA Y 0 82 0 0
## 6 TA Y 157 84 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 120 0 <NA> MnPrv <NA> 0 6 2010
## 2 0 0 <NA> <NA> Gar2 12500 6 2010
## 3 0 0 <NA> MnPrv <NA> 0 3 2010
## 4 0 0 <NA> <NA> <NA> 0 6 2010
## 5 144 0 <NA> <NA> <NA> 0 1 2010
## 6 0 0 <NA> <NA> <NA> 0 4 2010
## SaleType SaleCondition
## 1 WD Normal
## 2 WD Normal
## 3 WD Normal
## 4 WD Normal
## 5 WD Normal
## 6 WD Normal
MSSubClass: Identifies the type of dwelling involved in the sale.
20 1-STORY 1946 & NEWER ALL STYLES
30 1-STORY 1945 & OLDER
40 1-STORY W/FINISHED ATTIC ALL AGES
45 1-1/2 STORY - UNFINISHED ALL AGES
50 1-1/2 STORY FINISHED ALL AGES
60 2-STORY 1946 & NEWER
70 2-STORY 1945 & OLDER
75 2-1/2 STORY ALL AGES
80 SPLIT OR MULTI-LEVEL
85 SPLIT FOYER
90 DUPLEX - ALL STYLES AND AGES
120 1-STORY PUD (Planned Unit Development) - 1946 & NEWER
150 1-1/2 STORY PUD - ALL AGES
160 2-STORY PUD - 1946 & NEWER
180 PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
190 2 FAMILY CONVERSION - ALL STYLES AND AGES
MSZoning: Identifies the general zoning classification of the sale.
A Agriculture
C Commercial
FV Floating Village Residential
I Industrial
RH Residential High Density
RL Residential Low Density
RP Residential Low Density Park
RM Residential Medium Density
LotFrontage: Linear feet of street connected to property
LotArea: Lot size in square feet
Street: Type of road access to property
Grvl Gravel
Pave Paved
Alley: Type of alley access to property
Grvl Gravel
Pave Paved
NA No alley access
LotShape: General shape of property
Reg Regular
IR1 Slightly irregular
IR2 Moderately Irregular
IR3 Irregular
LandContour: Flatness of the property
Lvl Near Flat/Level
Bnk Banked - Quick and significant rise from street grade to building
HLS Hillside - Significant slope from side to side
Low Depression
Utilities: Type of utilities available
AllPub All public Utilities (E,G,W,& S)
NoSewr Electricity, Gas, and Water (Septic Tank)
NoSeWa Electricity and Gas Only
ELO Electricity only
LotConfig: Lot configuration
Inside Inside lot
Corner Corner lot
CulDSac Cul-de-sac
FR2 Frontage on 2 sides of property
FR3 Frontage on 3 sides of property
LandSlope: Slope of property
Gtl Gentle slope
Mod Moderate Slope
Sev Severe Slope
Neighborhood: Physical locations within Ames city limits
Blmngtn Bloomington Heights
Blueste Bluestem
BrDale Briardale
BrkSide Brookside
ClearCr Clear Creek
CollgCr College Creek
Crawfor Crawford
Edwards Edwards
Gilbert Gilbert
IDOTRR Iowa DOT and Rail Road
MeadowV Meadow Village
Mitchel Mitchell
Names North Ames
NoRidge Northridge
NPkVill Northpark Villa
NridgHt Northridge Heights
NWAmes Northwest Ames
OldTown Old Town
SWISU South & West of Iowa State University
Sawyer Sawyer
SawyerW Sawyer West
Somerst Somerset
StoneBr Stone Brook
Timber Timberland
Veenker Veenker
Condition1: Proximity to various conditions
Artery Adjacent to arterial street
Feedr Adjacent to feeder street
Norm Normal
RRNn Within 200' of North-South Railroad
RRAn Adjacent to North-South Railroad
PosN Near positive off-site feature--park, greenbelt, etc.
PosA Adjacent to postive off-site feature
RRNe Within 200' of East-West Railroad
RRAe Adjacent to East-West Railroad
Condition2: Proximity to various conditions (if more than one is present)
Artery Adjacent to arterial street
Feedr Adjacent to feeder street
Norm Normal
RRNn Within 200' of North-South Railroad
RRAn Adjacent to North-South Railroad
PosN Near positive off-site feature--park, greenbelt, etc.
PosA Adjacent to postive off-site feature
RRNe Within 200' of East-West Railroad
RRAe Adjacent to East-West Railroad
BldgType: Type of dwelling
1Fam Single-family Detached
2FmCon Two-family Conversion; originally built as one-family dwelling
Duplx Duplex
TwnhsE Townhouse End Unit
TwnhsI Townhouse Inside Unit
HouseStyle: Style of dwelling
1Story One story
1.5Fin One and one-half story: 2nd level finished
1.5Unf One and one-half story: 2nd level unfinished
2Story Two story
2.5Fin Two and one-half story: 2nd level finished
2.5Unf Two and one-half story: 2nd level unfinished
SFoyer Split Foyer
SLvl Split Level
OverallQual: Rates the overall material and finish of the house
10 Very Excellent
9 Excellent
8 Very Good
7 Good
6 Above Average
5 Average
4 Below Average
3 Fair
2 Poor
1 Very Poor
OverallCond: Rates the overall condition of the house
10 Very Excellent
9 Excellent
8 Very Good
7 Good
6 Above Average
5 Average
4 Below Average
3 Fair
2 Poor
1 Very Poor
YearBuilt: Original construction date
YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)
RoofStyle: Type of roof
Flat Flat
Gable Gable
Gambrel Gabrel (Barn)
Hip Hip
Mansard Mansard
Shed Shed
RoofMatl: Roof material
ClyTile Clay or Tile
CompShg Standard (Composite) Shingle
Membran Membrane
Metal Metal
Roll Roll
Tar&Grv Gravel & Tar
WdShake Wood Shakes
WdShngl Wood Shingles
Exterior1st: Exterior covering on house
AsbShng Asbestos Shingles
AsphShn Asphalt Shingles
BrkComm Brick Common
BrkFace Brick Face
CBlock Cinder Block
CemntBd Cement Board
HdBoard Hard Board
ImStucc Imitation Stucco
MetalSd Metal Siding
Other Other
Plywood Plywood
PreCast PreCast
Stone Stone
Stucco Stucco
VinylSd Vinyl Siding
Wd Sdng Wood Siding
WdShing Wood Shingles
Exterior2nd: Exterior covering on house (if more than one material)
AsbShng Asbestos Shingles
AsphShn Asphalt Shingles
BrkComm Brick Common
BrkFace Brick Face
CBlock Cinder Block
CemntBd Cement Board
HdBoard Hard Board
ImStucc Imitation Stucco
MetalSd Metal Siding
Other Other
Plywood Plywood
PreCast PreCast
Stone Stone
Stucco Stucco
VinylSd Vinyl Siding
Wd Sdng Wood Siding
WdShing Wood Shingles
MasVnrType: Masonry veneer type
BrkCmn Brick Common
BrkFace Brick Face
CBlock Cinder Block
None None
Stone Stone
MasVnrArea: Masonry veneer area in square feet
ExterQual: Evaluates the quality of the material on the exterior
Ex Excellent
Gd Good
TA Average/Typical
Fa Fair
Po Poor
ExterCond: Evaluates the present condition of the material on the exterior
Ex Excellent
Gd Good
TA Average/Typical
Fa Fair
Po Poor
Foundation: Type of foundation
BrkTil Brick & Tile
CBlock Cinder Block
PConc Poured Contrete
Slab Slab
Stone Stone
Wood Wood
BsmtQual: Evaluates the height of the basement
Ex Excellent (100+ inches)
Gd Good (90-99 inches)
TA Typical (80-89 inches)
Fa Fair (70-79 inches)
Po Poor (<70 inches
NA No Basement
BsmtCond: Evaluates the general condition of the basement
Ex Excellent
Gd Good
TA Typical - slight dampness allowed
Fa Fair - dampness or some cracking or settling
Po Poor - Severe cracking, settling, or wetness
NA No Basement
BsmtExposure: Refers to walkout or garden level walls
Gd Good Exposure
Av Average Exposure (split levels or foyers typically score average or above)
Mn Mimimum Exposure
No No Exposure
NA No Basement
BsmtFinType1: Rating of basement finished area
GLQ Good Living Quarters
ALQ Average Living Quarters
BLQ Below Average Living Quarters
Rec Average Rec Room
LwQ Low Quality
Unf Unfinshed
NA No Basement
BsmtFinSF1: Type 1 finished square feet
BsmtFinType2: Rating of basement finished area (if multiple types)
GLQ Good Living Quarters
ALQ Average Living Quarters
BLQ Below Average Living Quarters
Rec Average Rec Room
LwQ Low Quality
Unf Unfinshed
NA No Basement
BsmtFinSF2: Type 2 finished square feet
BsmtUnfSF: Unfinished square feet of basement area
TotalBsmtSF: Total square feet of basement area
Heating: Type of heating
Floor Floor Furnace
GasA Gas forced warm air furnace
GasW Gas hot water or steam heat
Grav Gravity furnace
OthW Hot water or steam heat other than gas
Wall Wall furnace
HeatingQC: Heating quality and condition
Ex Excellent
Gd Good
TA Average/Typical
Fa Fair
Po Poor
CentralAir: Central air conditioning
N No
Y Yes
Electrical: Electrical system
SBrkr Standard Circuit Breakers & Romex
FuseA Fuse Box over 60 AMP and all Romex wiring (Average)
FuseF 60 AMP Fuse Box and mostly Romex wiring (Fair)
FuseP 60 AMP Fuse Box and mostly knob & tube wiring (poor)
Mix Mixed
1stFlrSF: First Floor square feet
2ndFlrSF: Second floor square feet
LowQualFinSF: Low quality finished square feet (all floors)
GrLivArea: Above grade (ground) living area square feet
BsmtFullBath: Basement full bathrooms
BsmtHalfBath: Basement half bathrooms
FullBath: Full bathrooms above grade
HalfBath: Half baths above grade
Bedroom: Bedrooms above grade (does NOT include basement bedrooms)
Kitchen: Kitchens above grade
KitchenQual: Kitchen quality
Ex Excellent
Gd Good
TA Typical/Average
Fa Fair
Po Poor
TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
Functional: Home functionality (Assume typical unless deductions are warranted)
Typ Typical Functionality
Min1 Minor Deductions 1
Min2 Minor Deductions 2
Mod Moderate Deductions
Maj1 Major Deductions 1
Maj2 Major Deductions 2
Sev Severely Damaged
Sal Salvage only
Fireplaces: Number of fireplaces
FireplaceQu: Fireplace quality
Ex Excellent - Exceptional Masonry Fireplace
Gd Good - Masonry Fireplace in main level
TA Average - Prefabricated Fireplace in main living area or Masonry Fireplace in basement
Fa Fair - Prefabricated Fireplace in basement
Po Poor - Ben Franklin Stove
NA No Fireplace
GarageType: Garage location
2Types More than one type of garage
Attchd Attached to home
Basment Basement Garage
BuiltIn Built-In (Garage part of house - typically has room above garage)
CarPort Car Port
Detchd Detached from home
NA No Garage
GarageYrBlt: Year garage was built
GarageFinish: Interior finish of the garage
Fin Finished
RFn Rough Finished
Unf Unfinished
NA No Garage
GarageCars: Size of garage in car capacity
GarageArea: Size of garage in square feet
GarageQual: Garage quality
Ex Excellent
Gd Good
TA Typical/Average
Fa Fair
Po Poor
NA No Garage
GarageCond: Garage condition
Ex Excellent
Gd Good
TA Typical/Average
Fa Fair
Po Poor
NA No Garage
PavedDrive: Paved driveway
Y Paved
P Partial Pavement
N Dirt/Gravel
WoodDeckSF: Wood deck area in square feet
OpenPorchSF: Open porch area in square feet
EnclosedPorch: Enclosed porch area in square feet
3SsnPorch: Three season porch area in square feet
ScreenPorch: Screen porch area in square feet
PoolArea: Pool area in square feet
PoolQC: Pool quality
Ex Excellent
Gd Good
TA Average/Typical
Fa Fair
NA No Pool
Fence: Fence quality
GdPrv Good Privacy
MnPrv Minimum Privacy
GdWo Good Wood
MnWw Minimum Wood/Wire
NA No Fence
MiscFeature: Miscellaneous feature not covered in other categories
Elev Elevator
Gar2 2nd Garage (if not described in garage section)
Othr Other
Shed Shed (over 100 SF)
TenC Tennis Court
NA None
MiscVal: $Value of miscellaneous feature
MoSold: Month Sold (MM)
YrSold: Year Sold (YYYY)
SaleType: Type of sale
WD Warranty Deed - Conventional
CWD Warranty Deed - Cash
VWD Warranty Deed - VA Loan
New Home just constructed and sold
COD Court Officer Deed/Estate
Con Contract 15% Down payment regular terms
ConLw Contract Low Down payment and low interest
ConLI Contract Low Interest
ConLD Contract Low Down
Oth Other
SaleCondition: Condition of sale
Normal Normal Sale
Abnorml Abnormal Sale - trade, foreclosure, short sale
AdjLand Adjoining Land Purchase
Alloca Allocation - two linked properties with separate deeds, typically condo with a garage unit
Family Sale between family members
Partial Home was not completed when last assessed (associated with New Homes)
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
## 'data.frame': 1459 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
## $ Alley : Factor w/ 2 levels "Grvl","Pave": NA NA NA NA NA NA NA NA NA NA ...
## $ LotShape : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 1 1 1 1 4 1 4 4 ...
## $ LandContour : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Utilities : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
## $ LotConfig : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
## $ LandSlope : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
## $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
## $ Condition1 : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
## $ Condition2 : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
## $ BldgType : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ HouseStyle : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ RoofMatl : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Exterior1st : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
## $ Exterior2nd : Factor w/ 16 levels "AsbShng","AsphShn",..: 14 9 14 16 14 14 14 7 16 9 ...
## $ MasVnrType : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 3 2 3 2 3 4 4 3 3 ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
## $ ExterCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ Foundation : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
## $ BsmtQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
## $ BsmtCond : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
## $ BsmtExposure : Factor w/ 4 levels "Av","Gd","Mn",..: 4 2 3 4 1 4 1 3 4 4 ...
## $ BsmtFinType1 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 3 1 3 1 3 3 3 1 6 3 ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 6 6 6 6 6 6 6 2 6 6 ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ HeatingQC : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
## $ CentralAir : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
## $ Electrical : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : Factor w/ 5 levels "Ex","Fa","Gd",..: NA 5 5 3 5 NA 3 5 5 5 ...
## $ GarageType : Factor w/ 6 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : Factor w/ 3 levels "Fin","RFn","Unf": 2 2 2 3 2 3 2 2 3 2 ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
## $ GarageCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ PavedDrive : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : Factor w/ 3 levels "Ex","Fa","Gd": NA NA NA NA NA NA NA NA NA NA ...
## $ Fence : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA 3 NA NA NA NA ...
## $ MiscFeature : Factor w/ 4 levels "Gar2","Othr",..: NA NA NA NA NA 3 NA 3 NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.00 C (all): 10 Min. : 21.00
## 1st Qu.: 365.5 1st Qu.: 20.00 FV : 65 1st Qu.: 59.00
## Median : 730.0 Median : 50.00 RH : 16 Median : 69.00
## Mean : 730.0 Mean : 56.92 RL :1150 Mean : 70.05
## 3rd Qu.:1094.5 3rd Qu.: 70.00 RM : 218 3rd Qu.: 80.00
## Max. :1459.0 Max. :190.00 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape LandContour
## Min. : 1300 Grvl: 6 Grvl: 50 IR1:484 Bnk: 63
## 1st Qu.: 7549 Pave:1453 Pave: 41 IR2: 41 HLS: 50
## Median : 9477 NA's:1368 IR3: 10 Low: 36
## Mean : 10517 Reg:924 Lvl:1310
## 3rd Qu.: 11603
## Max. :215245
##
## Utilities LotConfig LandSlope Neighborhood Condition1
## AllPub:1458 Corner : 263 Gtl:1381 NAmes :225 Norm :1259
## NoSeWa: 1 CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81
## FR2 : 47 Sev: 13 OldTown:113 Artery : 48
## FR3 : 4 Edwards: 99 RRAn : 26
## Inside :1051 Somerst: 86 PosN : 19
## Gilbert: 79 RRAe : 11
## (Other):707 (Other): 15
## Condition2 BldgType HouseStyle OverallQual OverallCond
## Norm :1444 1Fam :1219 1Story :725 Min. : 1.0 Min. :1.000
## Feedr : 6 2fmCon: 31 2Story :445 1st Qu.: 5.0 1st Qu.:5.000
## Artery : 2 Duplex: 52 1.5Fin :154 Median : 6.0 Median :5.000
## PosN : 2 Twnhs : 43 SLvl : 65 Mean : 6.1 Mean :5.575
## RRNn : 2 TwnhsE: 114 SFoyer : 37 3rd Qu.: 7.0 3rd Qu.:6.000
## PosA : 1 1.5Unf : 14 Max. :10.0 Max. :9.000
## (Other): 2 (Other): 19
## YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1872 Min. :1950 Flat : 13 CompShg:1433 VinylSd:515
## 1st Qu.:1954 1st Qu.:1967 Gable :1140 Tar&Grv: 11 HdBoard:221
## Median :1973 Median :1994 Gambrel: 11 WdShngl: 6 MetalSd:220
## Mean :1971 Mean :1985 Hip : 286 WdShake: 5 Wd Sdng:206
## 3rd Qu.:2000 3rd Qu.:2004 Mansard: 7 ClyTile: 1 Plywood:108
## Max. :2010 Max. :2010 Shed : 2 Membran: 1 CemntBd: 61
## (Other): 2 (Other):128
## Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond
## VinylSd:504 BrkCmn : 15 Min. : 0.0 Ex: 52 Ex: 3
## MetalSd:214 BrkFace:445 1st Qu.: 0.0 Fa: 14 Fa: 28
## HdBoard:206 None :863 Median : 0.0 Gd:487 Gd: 146
## Wd Sdng:197 Stone :128 Mean : 103.8 TA:906 Po: 1
## Plywood:142 NA's : 8 3rd Qu.: 166.0 TA:1281
## CmentBd: 60 Max. :1600.0
## (Other):136 NA's :8
## Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1
## BrkTil:146 Ex :121 Fa : 45 Av :221 ALQ :220
## CBlock:633 Fa : 35 Gd : 65 Gd :134 BLQ :147
## PConc :647 Gd :618 Po : 2 Mn :114 GLQ :418
## Slab : 24 TA :648 TA :1310 No :952 LwQ : 74
## Stone : 6 NA's: 37 NA's: 37 NA's: 38 Rec :133
## Wood : 3 Unf :430
## NA's: 37
## BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## Min. : 0.0 ALQ : 19 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0 BLQ : 33 1st Qu.: 0.00 1st Qu.: 223.5
## Median : 383.0 GLQ : 14 Median : 0.00 Median : 479.0
## Mean : 443.4 LwQ : 45 Mean : 46.38 Mean : 567.5
## 3rd Qu.: 712.0 Rec : 54 3rd Qu.: 0.00 3rd Qu.: 808.0
## Max. :5644.0 Unf :1256 Max. :1474.00 Max. :2336.0
## NA's: 38
## TotalBsmtSF Heating HeatingQC CentralAir Electrical
## Min. : 0.0 Floor: 1 Ex:741 N: 95 FuseA: 94
## 1st Qu.: 795.5 GasA :1427 Fa: 49 Y:1364 FuseF: 27
## Median : 991.0 GasW : 18 Gd:240 FuseP: 3
## Mean :1057.3 Grav : 7 Po: 1 Mix : 1
## 3rd Qu.:1298.5 OthW : 2 TA:428 SBrkr:1333
## Max. :6110.0 Wall : 4 NA's : 1
##
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## Min. : 334 Min. : 0.0 Min. : 0.000 Min. : 334
## 1st Qu.: 882 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.:1129
## Median :1086 Median : 0.0 Median : 0.000 Median :1464
## Mean :1163 Mean : 347.2 Mean : 5.848 Mean :1516
## 3rd Qu.:1392 3rd Qu.: 728.0 3rd Qu.: 0.000 3rd Qu.:1778
## Max. :4692 Max. :2065.0 Max. :572.000 Max. :5642
##
## BsmtFullBath BsmtHalfBath FullBath HalfBath
## Min. :0.0000 Min. :0.00000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :2.000 Median :0.0000
## Mean :0.4249 Mean :0.05757 Mean :1.565 Mean :0.3825
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.00000 Max. :3.000 Max. :2.0000
##
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Min. :0.000 Ex:100 Min. : 2.000 Maj1: 14
## 1st Qu.:2.000 1st Qu.:1.000 Fa: 39 1st Qu.: 5.000 Maj2: 5
## Median :3.000 Median :1.000 Gd:586 Median : 6.000 Min1: 31
## Mean :2.866 Mean :1.047 TA:734 Mean : 6.518 Min2: 34
## 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000 Mod : 15
## Max. :8.000 Max. :3.000 Max. :14.000 Sev : 1
## Typ :1359
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## Min. :0.0000 Ex : 24 2Types : 6 Min. :1900 Fin :351
## 1st Qu.:0.0000 Fa : 33 Attchd :869 1st Qu.:1961 RFn :422
## Median :1.0000 Gd :380 Basment: 19 Median :1980 Unf :605
## Mean :0.6134 Po : 20 BuiltIn: 88 Mean :1979 NA's: 81
## 3rd Qu.:1.0000 TA :313 CarPort: 9 3rd Qu.:2002
## Max. :3.0000 NA's:689 Detchd :387 Max. :2010
## NA's : 81 NA's :81
## GarageCars GarageArea GarageQual GarageCond PavedDrive
## Min. :0.000 Min. : 0.0 Ex : 3 Ex : 2 N: 90
## 1st Qu.:1.000 1st Qu.: 336.0 Fa : 48 Fa : 35 P: 30
## Median :2.000 Median : 480.0 Gd : 14 Gd : 9 Y:1339
## Mean :1.768 Mean : 473.1 Po : 3 Po : 7
## 3rd Qu.:2.000 3rd Qu.: 576.0 TA :1310 TA :1325
## Max. :4.000 Max. :1418.0 NA's: 81 NA's: 81
##
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## Min. : 0.0 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.0 Median : 25.00 Median : 0.00 Median : 0.000
## Mean : 93.8 Mean : 46.65 Mean : 21.97 Mean : 3.412
## 3rd Qu.:168.0 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :857.0 Max. :547.00 Max. :552.00 Max. :508.000
##
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## Min. : 0.00 Min. : 0.000 Ex : 2 GdPrv: 59 Gar2: 2
## 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2 GdWo : 54 Othr: 2
## Median : 0.00 Median : 0.000 Gd : 3 MnPrv: 157 Shed: 49
## Mean : 15.07 Mean : 2.761 NA's:1452 MnWw : 11 TenC: 1
## 3rd Qu.: 0.00 3rd Qu.: 0.000 NA's :1178 NA's:1405
## Max. :480.00 Max. :738.000
##
## MiscVal MoSold YrSold SaleType
## Min. : 0.00 Min. : 1.000 Min. :2006 WD :1266
## 1st Qu.: 0.00 1st Qu.: 5.000 1st Qu.:2007 New : 122
## Median : 0.00 Median : 6.000 Median :2008 COD : 43
## Mean : 43.52 Mean : 6.322 Mean :2008 ConLD : 9
## 3rd Qu.: 0.00 3rd Qu.: 8.000 3rd Qu.:2009 ConLI : 5
## Max. :15500.00 Max. :12.000 Max. :2010 ConLw : 5
## (Other): 9
## SaleCondition SalePrice
## Abnorml: 101 Min. : 34900
## AdjLand: 4 1st Qu.:129950
## Alloca : 12 Median :163000
## Family : 20 Mean :180944
## Normal :1197 3rd Qu.:214000
## Partial: 125 Max. :755000
##
We can look at LotArea, Neighborhood, SalePrice, HouseStyle and OverallCondition variables in more detail.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1300 7549 9477 10517 11603 215245
The average Lot Area in the dataset is 10517 square feet. The maximum lot size is 215245 square feet.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129950 163000 180944 214000 755000
The average Sale Price in the dataset is 180944. The maximum Sale Price is 755000.
Majority of the houses are 1 or 2 story style houses.
theme_set(theme_classic())
df_train$OverallCond_factor <- as.factor(as.character(df_train$OverallCond))
ggplot(df_train, aes(OverallCond, SalePrice))+
geom_boxplot(aes(fill=OverallCond_factor))
Condition of the house has a positive correlation with SalePrice.
# Horizontal boxplots of SalePrice per Neighborhood. varwidth = TRUE makes
# each box's width depend on the number of observations in its group.
theme_set(theme_classic())
ggplot(df_train, aes(Neighborhood, SalePrice)) +
  geom_boxplot(varwidth = TRUE, fill = "plum") +
  coord_flip()

# Keep only the columns used in the correlation analysis that follows.
df_train_1 <- select(
  df_train,
  Id, SalePrice, LotArea, YearBuilt, OverallCond, OverallQual
)
head(df_train_1)## Id SalePrice LotArea YearBuilt OverallCond OverallQual
## 1 1 208500 8450 2003 5 7
## 2 2 181500 9600 1976 8 6
## 3 3 223500 11250 2001 5 7
## 4 4 140000 9550 1915 5 7
## 5 5 250000 14260 2000 5 8
## 6 6 143000 14115 1993 5 5
# Pairwise correlations between every column of df_train_1 (cor() defaults).
# NOTE(review): Id is included here and looks like a row identifier rather
# than a meaningful variable; confirm whether it belongs in the matrix.
corr <- cor(df_train_1)

# Lower-triangle correlogram with the correlation values printed as labels.
ggcorrplot(corr,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  lab_size = 3,
  method = "circle",
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlogram of House Train Data Set",
  ggtheme = theme_bw()
)

# Switch the default ggplot2 theme for the plots that follow.
theme_set(theme_bw())
ggplot(df_train_1, aes(OverallQual, SalePrice))+
geom_jitter(width=10, size=1)
\(H_{0}\): The correlation between each pair of variables is 0.
The confidence level is 80%.
1. Look at the p-value of the correlation test for each pair of variables from df_train_1. 2. Use 0.80 as the confidence level.
(a): Sale Price Vs OverallQual
##
## Pearson's product-moment correlation
##
## data: df_train_1$SalePrice and df_train_1$OverallQual
## t = 49.345, df = 1457, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.7780602 0.8032151
## sample estimates:
## cor
## 0.7909716
(b): Sale Price VS YearBuilt
##
## Pearson's product-moment correlation
##
## data: df_train_1$SalePrice and df_train_1$YearBuilt
## t = 23.414, df = 1457, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4980468 0.5468503
## sample estimates:
## cor
## 0.5228769
(c): Sale Price VS LotArea
##
## Pearson's product-moment correlation
##
## data: df_train_1$SalePrice and df_train_1$LotArea
## t = 10.441, df = 1457, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2323277 0.2948047
## sample estimates:
## cor
## 0.2638429
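All three p-values are far below any conventional significance level, so we reject the null hypothesis of zero correlation: LotArea, YearBuilt and OverallQual are each correlated with SalePrice. The 80% confidence intervals for the correlations (0.23 to 0.29, 0.50 to 0.55 and 0.78 to 0.80, respectively) all exclude zero.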
Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Id SalePrice LotArea YearBuilt OverallCond
## Id 1.001906559 -0.0156547 0.03327363 -0.006747185 -0.01217039
## SalePrice -0.015654697 3.0205330 -0.56954917 -0.409888594 -0.11541100
## LotArea 0.033273628 -0.5695492 1.12331500 0.157586539 0.04347331
## YearBuilt -0.006747185 -0.4098886 0.15758654 1.826006946 0.59289392
## OverallCond -0.012170394 -0.1154110 0.04347331 0.592893923 1.20121977
## OverallQual 0.039115176 -2.1053214 0.24636251 -0.683350579 -0.14277440
## OverallQual
## Id 0.03911518
## SalePrice -2.10532139
## LotArea 0.24636251
## YearBuilt -0.68335058
## OverallCond -0.14277440
## OverallQual 3.01826995
**Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.**
## Id SalePrice LotArea YearBuilt
## Id 1.000000e+00 0.000000e+00 2.602085e-18 0.000000e+00
## SalePrice 0.000000e+00 1.000000e+00 -8.326673e-17 0.000000e+00
## LotArea 0.000000e+00 8.326673e-17 1.000000e+00 0.000000e+00
## YearBuilt 0.000000e+00 2.220446e-16 -8.326673e-17 1.000000e+00
## OverallCond -8.673617e-19 -5.551115e-17 3.469447e-18 4.163336e-17
## OverallQual -6.938894e-18 4.440892e-16 -2.775558e-17 -2.220446e-16
## OverallCond OverallQual
## Id 0.000000e+00 0.000000e+00
## SalePrice 6.938894e-17 4.440892e-16
## LotArea 1.214306e-17 5.551115e-17
## YearBuilt -4.163336e-17 4.440892e-16
## OverallCond 1.000000e+00 0.000000e+00
## OverallQual 2.775558e-17 1.000000e+00
## Id SalePrice LotArea YearBuilt
## Id 1.000000e+00 1.387779e-17 6.938894e-18 3.469447e-18
## SalePrice -1.387779e-17 1.000000e+00 -1.387779e-16 4.440892e-16
## LotArea -2.602085e-18 5.551115e-17 1.000000e+00 -5.551115e-17
## YearBuilt -3.469447e-18 1.110223e-16 0.000000e+00 1.000000e+00
## OverallCond -3.469447e-18 1.387779e-17 -1.734723e-18 4.163336e-17
## OverallQual 1.387779e-17 4.440892e-16 1.110223e-16 2.220446e-16
## OverallCond OverallQual
## Id 8.673617e-19 6.938894e-18
## SalePrice -5.551115e-17 4.440892e-16
## LotArea 0.000000e+00 5.551115e-17
## YearBuilt 1.387779e-17 -1.110223e-16
## OverallCond 1.000000e+00 5.551115e-17
## OverallQual 0.000000e+00 1.000000e+00
Conduct LU decomposition on the matrix.
## $L
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.000000000 0.00000000 0.00000000 0.0000000 0.00000000 0
## [2,] -0.015624907 1.00000000 0.00000000 0.0000000 0.00000000 0
## [3,] 0.033210310 -0.18840230 1.00000000 0.0000000 0.00000000 0
## [4,] -0.006734345 -0.13574665 0.07937588 1.0000000 0.00000000 0
## [5,] -0.012147234 -0.03827488 0.02177132 0.3261873 1.00000000 0
## [6,] 0.039040742 -0.69685738 -0.14922979 -0.5424148 0.09174876 1
##
## $U
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1.001907 -1.565470e-02 0.03327363 -6.747185e-03 -0.01217039
## [2,] 0.000000 3.020288e+00 -0.56902927 -4.099940e-01 -0.11560116
## [3,] 0.000000 0.000000e+00 1.01500355 8.056680e-02 0.02209797
## [4,] 0.000000 -5.551115e-17 0.00000000 1.763911e+00 0.57536545
## [5,] 0.000000 1.810703e-17 0.00000000 0.000000e+00 1.00848930
## [6,] 0.000000 -3.177137e-17 0.00000000 -1.110223e-16 0.00000000
## [,6]
## [1,] 0.03911518
## [2,] -2.10471022
## [3,] -0.15146876
## [4,] -0.95677155
## [5,] 0.09252764
## [6,] 1.00000000
Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function
## rate
## 9.508211e-05
## (2.489265e-06)
Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable.
Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).
## 5% 95%
## 480.3672 31822.0990
Also generate a 95% confidence interval from the empirical data, assuming normality.
## [1] 10517.23
# Simulate a normal sample with the same size, mean and standard deviation
# as lot_area.
normal_dist <- rnorm(
  n = length(lot_area),
  mean = mean(lot_area),
  sd = sd(lot_area)
)
hist(normal_dist, breaks = 100)
## 5% 95%
## -6455.641 26260.829
Finally, provide the empirical 5th percentile and 95th percentile of the data
## 5% 9.5%
## 3307.40 4921.53
Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
## Id SalePrice LotArea YearBuilt OverallCond OverallQual
## 1 1 208500 8450 2003 5 7
## 2 2 181500 9600 1976 8 6
## 3 3 223500 11250 2001 5 7
## 4 4 140000 9550 1915 5 7
## 5 5 250000 14260 2000 5 8
## 6 6 143000 14115 1993 5 5
# Fit the multiple regression with bare column names and `data =`.
# Writing the terms as df_train_1$... ties the model to that exact data frame,
# so predict(linear_model, newdata) cannot substitute the test-set columns
# and silently returns the training-set fitted values instead.
linear_model <- lm(
  SalePrice ~ LotArea + YearBuilt + OverallCond + OverallQual,
  data = df_train_1
)
summary(linear_model)
##
## Call:
## lm(formula = df_train_1$SalePrice ~ df_train_1$LotArea + df_train_1$YearBuilt +
## df_train_1$OverallCond + df_train_1$OverallQual)
##
## Residuals:
## Min 1Q Median 3Q Max
## -268646 -26367 -3704 20043 393008
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.981e+05 1.033e+05 -7.727 2.03e-14 ***
## df_train_1$LotArea 1.499e+00 1.210e-01 12.392 < 2e-16 ***
## df_train_1$YearBuilt 3.570e+02 5.281e+01 6.761 1.98e-11 ***
## df_train_1$OverallCond 2.732e+03 1.178e+03 2.319 0.0206 *
## df_train_1$OverallQual 4.004e+04 1.079e+03 37.092 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45790 on 1454 degrees of freedom
## Multiple R-squared: 0.6689, Adjusted R-squared: 0.668
## F-statistic: 734.4 on 4 and 1454 DF, p-value: < 2.2e-16
# Predict a sale price from hard-coded linear-model coefficients.
#
# LotArea, YearBuilt, OverallCond, OverallQual: numeric predictors; the
#   arithmetic is vectorised, so equal-length vectors work as well as scalars.
# Returns the predicted price(s) as a numeric vector.
sale_price <- function(LotArea, YearBuilt, OverallCond, OverallQual) {
  price <- -7.984e+05
  price <- price + 1.500e+00 * LotArea
  price <- price + 3.572e+02 * YearBuilt
  price <- price + 2.736e+03 * OverallCond
  price <- price + 4.003e+04 * OverallQual
  price
}
# Recompute the training-set prices with the hand-written sale_price() formula.
bckwrd <- sale_price(df_train_1$LotArea, df_train_1$YearBuilt, df_train_1$OverallCond, df_train_1$OverallQual)
# Side-by-side table: actual price, formula price, and their difference
# (formula minus actual). The column names are derived by data.frame() from
# the expressions passed below.
comparison <-data.frame(df_train_1$SalePrice, bckwrd, bckwrd - df_train_1$SalePrice)
head(comparison)
## df_train_1.SalePrice bckwrd bckwrd...df_train_1.SalePrice
## 1 208500 223636.6 15136.6
## 2 181500 183895.2 2395.2
## 3 223500 227122.2 3622.2
## 4 140000 193853.0 53853.0
## 5 250000 271310.0 21310.0
## 6 143000 148502.1 5502.1
## Id LotArea YearBuilt OverallCond OverallQual
## 1 1461 11622 1961 6 5
## 2 1462 14267 1958 6 6
## 3 1463 13830 1997 5 5
## 4 1464 9978 1998 6 6
## 5 1465 5005 1992 5 8
## 6 1466 10000 1993 5 6
## [1] 35462.75
## [1] 35494.47
# Score the test set and build the two-column submission table.
# NOTE(review): predict() looks up the formula's variable names in newdata,
# so linear_model must have been fit on bare column names (with `data =`)
# for df_test_1 to be used here; confirm against the lm() call.
prediction <- predict(linear_model, newdata = df_test_1)
predict_df <- data.frame(Id = df_test_1$Id, SalePrice = prediction)
# A sale price cannot be negative: clamp negative predictions to zero.
predict_df$SalePrice <- pmax(predict_df$SalePrice, 0)
# export to csv
#write.csv(predict_df, "prediction_results.csv", row.names = FALSE)
Predictions are submitted to Kaggle for results.