In this final exam, we delve into Computational Mathematics by participating in the House Prices: Advanced Regression Techniques competition hosted on Kaggle.com. Through this exam, we aim to showcase our understanding of various mathematical concepts, statistical methods, and their application in real-world datasets. The exam spans across different branches of mathematics including probability theory, descriptive and inferential statistics, linear algebra, calculus-based probability and statistics, and modeling through regression analysis.
## corrplot 0.92 loaded
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Read the competition training file (train.csv) from the course repository.
houses <- fread(
  "https://raw.githubusercontent.com/waheeb123/Data-605/main/Final%20Project/house-prices-advanced-regression-techniques/train.csv"
)
# Normalise the column names: spaces become underscores, then lower-case.
names(houses) <- tolower(gsub(" ", "_", names(houses), fixed = TRUE))
# Print the type and first few values of every column.
str(houses)## Classes 'data.table' and 'data.frame': 1460 obs. of 81 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ mssubclass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ mszoning : chr "RL" "RL" "RL" "RL" ...
## $ lotfrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ lotarea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ alley : chr NA NA NA NA ...
## $ lotshape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ landcontour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ lotconfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ landslope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ bldgtype : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ housestyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ overallqual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ overallcond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ yearbuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ yearremodadd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ roofstyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ roofmatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ masvnrtype : chr "BrkFace" "None" "BrkFace" "None" ...
## $ masvnrarea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ exterqual : chr "Gd" "TA" "Gd" "TA" ...
## $ extercond : chr "TA" "TA" "TA" "TA" ...
## $ foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ bsmtqual : chr "Gd" "Gd" "Gd" "TA" ...
## $ bsmtcond : chr "TA" "TA" "TA" "Gd" ...
## $ bsmtexposure : chr "No" "Gd" "Mn" "No" ...
## $ bsmtfintype1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ bsmtfinsf1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ bsmtfintype2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ bsmtfinsf2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ bsmtunfsf : int 150 284 434 540 490 64 317 216 952 140 ...
## $ totalbsmtsf : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ heatingqc : chr "Ex" "Ex" "Ex" "Gd" ...
## $ centralair : chr "Y" "Y" "Y" "Y" ...
## $ electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ 1stflrsf : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ 2ndflrsf : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ lowqualfinsf : int 0 0 0 0 0 0 0 0 0 0 ...
## $ grlivarea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ bsmtfullbath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ bsmthalfbath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ fullbath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ halfbath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ bedroomabvgr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ kitchenabvgr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ kitchenqual : chr "Gd" "TA" "Gd" "Gd" ...
## $ totrmsabvgrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ fireplacequ : chr NA "TA" "TA" "Gd" ...
## $ garagetype : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ garageyrblt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ garagefinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ garagecars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ garagearea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ garagequal : chr "TA" "TA" "TA" "TA" ...
## $ garagecond : chr "TA" "TA" "TA" "TA" ...
## $ paveddrive : chr "Y" "Y" "Y" "Y" ...
## $ wooddecksf : int 0 298 0 0 192 40 255 235 90 0 ...
## $ openporchsf : int 61 0 42 35 84 30 57 204 0 4 ...
## $ enclosedporch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ 3ssnporch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ screenporch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poolarea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poolqc : chr NA NA NA NA ...
## $ fence : chr NA NA NA NA ...
## $ miscfeature : chr NA NA NA NA ...
## $ miscval : int 0 0 0 0 0 700 0 350 0 0 ...
## $ mosold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ yrsold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ saletype : chr "WD" "WD" "WD" "WD" ...
## $ salecondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ saleprice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
## - attr(*, ".internal.selfref")=<externalptr>
## [1] 1460 81
| id | mssubclass | mszoning | lotfrontage | lotarea | street | alley | lotshape | landcontour | utilities | lotconfig | landslope | neighborhood | condition1 | condition2 | bldgtype | housestyle | overallqual | overallcond | yearbuilt | yearremodadd | roofstyle | roofmatl | exterior1st | exterior2nd | masvnrtype | masvnrarea | exterqual | extercond | foundation | bsmtqual | bsmtcond | bsmtexposure | bsmtfintype1 | bsmtfinsf1 | bsmtfintype2 | bsmtfinsf2 | bsmtunfsf | totalbsmtsf | heating | heatingqc | centralair | electrical | 1stflrsf | 2ndflrsf | lowqualfinsf | grlivarea | bsmtfullbath | bsmthalfbath | fullbath | halfbath | bedroomabvgr | kitchenabvgr | kitchenqual | totrmsabvgrd | functional | fireplaces | fireplacequ | garagetype | garageyrblt | garagefinish | garagecars | garagearea | garagequal | garagecond | paveddrive | wooddecksf | openporchsf | enclosedporch | 3ssnporch | screenporch | poolarea | poolqc | fence | miscfeature | miscval | mosold | yrsold | saletype | salecondition | saleprice |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 60 | RL | 65 | 8450 | Pave | NA | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 196 | Gd | TA | PConc | Gd | TA | No | GLQ | 706 | Unf | 0 | 150 | 856 | GasA | Ex | Y | SBrkr | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | Typ | 0 | NA | Attchd | 2003 | RFn | 2 | 548 | TA | TA | Y | 0 | 61 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 2 | 2008 | WD | Normal | 208500 |
| 2 | 20 | RL | 80 | 9600 | Pave | NA | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | None | 0 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 978 | Unf | 0 | 284 | 1262 | GasA | Ex | Y | SBrkr | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1976 | RFn | 2 | 460 | TA | TA | Y | 298 | 0 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 5 | 2007 | WD | Normal | 181500 |
| 3 | 60 | RL | 68 | 11250 | Pave | NA | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 162 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 486 | Unf | 0 | 434 | 920 | GasA | Ex | Y | SBrkr | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 1 | TA | Attchd | 2001 | RFn | 2 | 608 | TA | TA | Y | 0 | 42 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 9 | 2008 | WD | Normal | 223500 |
| 4 | 70 | RL | 60 | 9550 | Pave | NA | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | None | 0 | TA | TA | BrkTil | TA | Gd | No | ALQ | 216 | Unf | 0 | 540 | 756 | GasA | Gd | Y | SBrkr | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Detchd | 1998 | Unf | 3 | 642 | TA | TA | Y | 0 | 35 | 272 | 0 | 0 | 0 | NA | NA | NA | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 5 | 60 | RL | 84 | 14260 | Pave | NA | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 350 | Gd | TA | PConc | Gd | TA | Av | GLQ | 655 | Unf | 0 | 490 | 1145 | GasA | Ex | Y | SBrkr | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | Typ | 1 | TA | Attchd | 2000 | RFn | 3 | 836 | TA | TA | Y | 192 | 84 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 12 | 2008 | WD | Normal | 250000 |
| 6 | 50 | RL | 85 | 14115 | Pave | NA | IR1 | Lvl | AllPub | Inside | Gtl | Mitchel | Norm | Norm | 1Fam | 1.5Fin | 5 | 5 | 1993 | 1995 | Gable | CompShg | VinylSd | VinylSd | None | 0 | TA | TA | Wood | Gd | TA | No | GLQ | 732 | Unf | 0 | 64 | 796 | GasA | Ex | Y | SBrkr | 796 | 566 | 0 | 1362 | 1 | 0 | 1 | 1 | 1 | 1 | TA | 5 | Typ | 0 | NA | Attchd | 1993 | Unf | 2 | 480 | TA | TA | Y | 40 | 30 | 0 | 320 | 0 | 0 | NA | MnPrv | Shed | 700 | 10 | 2009 | WD | Normal | 143000 |
Missing values are handled at the beginning so that the later analysis runs smoothly. A data frame of per-column NA counts is created for the houses data, and related columns are cross-tabulated against each other to decide whether each NA should be removed or replaced with a suitable value. In our dataset the garage columns (such as garage type and garage condition) are interlinked, so such columns are checked in pairs before their missing values are handled.
# Number of missing values per column, stored in a one-column data frame
# whose single column is named "na" (row names are the column names of houses).
df_na <- data.frame(na = colSums(is.na(houses)))
kable(table(houses$poolqc,houses$poolarea, useNA = 'ifany'))| 0 | 480 | 512 | 519 | 555 | 576 | 648 | 738 | |
|---|---|---|---|---|---|---|---|---|
| Ex | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| Fa | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| Gd | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| NA | 1453 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# All 1,453 missing poolqc values occur where poolarea is 0, so they are
# treated as "no pool" and recoded as "None".
houses$poolqc <- replace(houses$poolqc, is.na(houses$poolqc), "None")
# Same cross-check for fireplaces: quality against number of fireplaces.
table(houses$fireplacequ, houses$fireplaces, useNA = "ifany")##
## 0 1 2 3
## Ex 0 19 4 1
## Fa 0 28 4 1
## Gd 0 324 54 2
## Po 0 20 0 0
## TA 0 259 53 1
## <NA> 690 0 0 0
# Missing fireplace quality only occurs where the fireplace count is 0,
# so it is recoded as "None".
houses$fireplacequ[is.na(houses$fireplacequ)] <- "None"
# Missing values in the garage columns are assumed to mean "no garage at all".
# table(houses$garagetype, houses$garageyrblt, useNA = "ifany")
houses$garagetype[is.na(houses$garagetype)] <- "None"
houses$garagecond[is.na(houses$garagecond)] <- "None"
houses$garagefinish[is.na(houses$garagefinish)] <- "None"
houses$garagequal[is.na(houses$garagequal)] <- "None"
# garageyrblt is an integer year. Filling it with "" silently coerces the
# whole column to character, which removes it from every numeric summary and
# model. Use the integer sentinel 0L instead, in line with the 0 used for the
# other numeric "feature absent" columns, so the column stays numeric.
# NOTE(review): any model that uses garageyrblt as a numeric predictor should
# treat 0 as "no garage" rather than as a real year.
houses$garageyrblt[is.na(houses$garageyrblt)] <- 0L
# Masonry veneer: check whether area and type are missing on the same rows.
table(is.na(houses$masvnrarea), is.na(houses$masvnrtype))##
## FALSE TRUE
## FALSE 1452 0
## TRUE 0 8
# The 8 rows are missing in both columns: recode the type as "None", the area as 0.
houses$masvnrtype <- replace(houses$masvnrtype, is.na(houses$masvnrtype), "None")
houses$masvnrarea <- replace(houses$masvnrarea, is.na(houses$masvnrarea), 0)
# Basement: cross-tabulate the two finish-type columns, keeping NA as a level.
table(houses$bsmtfintype1, houses$bsmtfintype2, useNA = "ifany")##
## ALQ BLQ GLQ LwQ Rec Unf <NA>
## ALQ 0 15 2 15 22 166 0
## BLQ 2 1 5 13 15 112 0
## GLQ 4 2 0 10 9 392 1
## LwQ 9 4 7 0 8 46 0
## Rec 4 11 0 8 0 110 0
## Unf 0 0 0 0 0 430 0
## <NA> 0 0 0 0 0 0 37
# Recode missing basement finish types as "None".
houses$bsmtfintype1 <- replace(
  houses$bsmtfintype1, is.na(houses$bsmtfintype1), "None"
)
houses$bsmtfintype2 <- replace(
  houses$bsmtfintype2, is.na(houses$bsmtfintype2), "None"
)
# Cross-tabulate basement quality against basement condition, keeping NA.
table(houses$bsmtqual, houses$bsmtcond, useNA = "ifany")##
## Fa Gd Po TA <NA>
## Ex 0 11 0 110 0
## Fa 8 0 2 25 0
## Gd 2 36 0 580 0
## TA 35 18 0 596 0
## <NA> 0 0 0 0 37
# Recode missing basement quality, condition and exposure as "None".
houses$bsmtqual <- replace(houses$bsmtqual, is.na(houses$bsmtqual), "None")
houses$bsmtcond <- replace(houses$bsmtcond, is.na(houses$bsmtcond), "None")
houses$bsmtexposure <- replace(
  houses$bsmtexposure, is.na(houses$bsmtexposure), "None"
)
# bsmtexposure already has a "No" level. A missing value is assumed to mean
# something different from a basement with no exposure, so it gets its own
# "None" level instead of being merged into "No".

# Miscellaneous columns: a missing category becomes "None".
houses$miscfeature <- replace(
  houses$miscfeature, is.na(houses$miscfeature), "None"
)
houses$fence <- replace(houses$fence, is.na(houses$fence), "None")
houses$alley <- replace(houses$alley, is.na(houses$alley), "None")
# NOTE(review): missing lot frontage is filled with 0, which places every
# filled row at the minimum of this variable -- confirm this is intended.
houses$lotfrontage <- replace(
  houses$lotfrontage, is.na(houses$lotfrontage), 0
)
houses$electrical <- replace(
  houses$electrical, is.na(houses$electrical), "None"
)
# Recount the missing values per column after the replacements.
colSums(is.na(houses))## id mssubclass mszoning lotfrontage lotarea
## 0 0 0 0 0
## street alley lotshape landcontour utilities
## 0 0 0 0 0
## lotconfig landslope neighborhood condition1 condition2
## 0 0 0 0 0
## bldgtype housestyle overallqual overallcond yearbuilt
## 0 0 0 0 0
## yearremodadd roofstyle roofmatl exterior1st exterior2nd
## 0 0 0 0 0
## masvnrtype masvnrarea exterqual extercond foundation
## 0 0 0 0 0
## bsmtqual bsmtcond bsmtexposure bsmtfintype1 bsmtfinsf1
## 0 0 0 0 0
## bsmtfintype2 bsmtfinsf2 bsmtunfsf totalbsmtsf heating
## 0 0 0 0 0
## heatingqc centralair electrical 1stflrsf 2ndflrsf
## 0 0 0 0 0
## lowqualfinsf grlivarea bsmtfullbath bsmthalfbath fullbath
## 0 0 0 0 0
## halfbath bedroomabvgr kitchenabvgr kitchenqual totrmsabvgrd
## 0 0 0 0 0
## functional fireplaces fireplacequ garagetype garageyrblt
## 0 0 0 0 0
## garagefinish garagecars garagearea garagequal garagecond
## 0 0 0 0 0
## paveddrive wooddecksf openporchsf enclosedporch 3ssnporch
## 0 0 0 0 0
## screenporch poolarea poolqc fence miscfeature
## 0 0 0 0 0
## miscval mosold yrsold saletype salecondition
## 0 0 0 0 0
## saleprice
## 0
Another required check is to find the unique levels of the categorical variables, because categories are sometimes entered inconsistently (all lower case versus first letter capitalized). For example, the column misc feature has the categories
“None” “Shed” “Gar2” “Othr” “TenC”. We can check the variables to confirm that the data are correct: “Othr” and “othr” might correspond to the same category. A random check was performed on a few categorical variables; the same check can be applied to whichever variables are required for the analysis.
## [1] "TA" "Gd" "Fa" "Po" "Ex"
## [1] "TA" "Gd" "Fa" "Po" "Ex"
## [1] 0 700 350 500 400 480 450 15500 1200 800 2000 600
## [13] 3500 1300 54 620 560 1400 8300 1150 2500
## [1] "Pave" "Grvl"
Pick one of the quantitative independent variables from the training data set (train.csv), and define that variable as X. Make sure this variable is skewed to the right! Pick the dependent variable and define it as Y.
The variable selected as the predictor in the code below is lot frontage. Variable X is the lot frontage and variable Y is the sale price.
Plotting the variables.
Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3d quartile of the X variable, and the small letter “y” is estimated as the 2d quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts.
# Response (Y) and predictor (X) used in the probability questions.
Y <- houses[["saleprice"]]
X <- houses[["lotfrontage"]]
# Thresholds: y is the second quartile (median) of Y,
# x is the third quartile of X.
y <- quantile(Y, probs = 0.50, na.rm = TRUE)
x <- quantile(X, probs = 0.75, na.rm = TRUE)## [1] 0.2675471
# Indicator vectors: X above its third quartile, Y above its median.
x_above <- X > x
y_above <- Y > y
n_y_above <- sum(y_above)
n_both_above <- sum(x_above & y_above)
# a. P(X>x | Y>y)
prob_a <- n_both_above / n_y_above
# b. P(X>x, Y>y); X and Y have one value per row, so length(X) is the total.
prob_b <- n_both_above / length(X)
# c. P(X<x | Y>y); strict inequality, so rows with X equal to x are counted
#    in neither a nor c.
prob_c <- sum(X < x & y_above) / n_y_above
# Two-way table of counts: each variable is split at its threshold, with the
# threshold value itself falling in the lower ("<=") category.
table_X <- cut(
  X,
  breaks = c(-Inf, x, Inf),
  labels = c("<=3d quartile", ">3d quartile")
)
table_Y <- cut(
  Y,
  breaks = c(-Inf, y, Inf),
  labels = c("<=2d quartile", ">2d quartile")
)
table_counts <- table(table_X, table_Y)
# Same table with row and column totals appended.
table_counts_with_totals <- addmargins(table_counts)
# Show the results.
print("Probability a:")## [1] "Probability a:"
##
##
## | x|
## |---------:|
## | 0.3337912|
## [1] "Probability b:"
##
##
## | x|
## |---------:|
## | 0.1664384|
## [1] "Probability c:"
##
##
## | x|
## |---------:|
## | 0.6510989|
## [1] "Table of counts:"
##
##
## | | <=2d quartile| >2d quartile| Sum|
## |:-------------|-------------:|------------:|----:|
## |<=3d quartile | 622| 485| 1107|
## |>3d quartile | 110| 243| 353|
## |Sum | 732| 728| 1460|
a. P(X>x | Y>y)
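Probability that lot frontage exceeds x (its third quartile) given that sale price exceeds y (its median). It is calculated as the number of observations where X is above its third quartile and Y is above its median (243), divided by the number of observations where Y is above its median (728), which gives 0.334.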
b. P(X>x, Y>y)
The probability indicates the likelihood of both lot frontage (X) and sale price (Y) exceeding their respective quartile values (x) and (y). It’s computed by dividing the count of observations where both X and Y exceed their quartile values (243) by the total count of observations (1460).
c. P(X<x | Y>y)
The probability signifies the chance that the lot frontage (X) is strictly below the third quartile value (x) given that the sale price (Y) exceeds the second quartile value (y). It’s computed by dividing the count of observations where X is strictly below the third quartile and Y is above the second quartile (474) by the total count of observations where Y is above the second quartile (728), which gives 0.651. The 485 shown in the table of counts is slightly larger because its "<=3d quartile" row also includes the observations where X is exactly equal to x.
Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 3d quartile for X, and let B be the new variable counting those observations above the 2d quartile for Y. Does P(A|B)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.
# Total number of observations.
total_count <- nrow(houses)
# A: observations above the third quartile of X.
# B: observations above the second quartile of Y.
count_A <- sum(X > x)
count_B <- sum(Y > y)
# Marginal probabilities.
prob_A <- count_A / total_count
prob_B <- count_B / total_count
# Logical indicators for A and B taken from the binned variables.
table_A <- table_X == ">3d quartile"
table_B <- table_Y == ">2d quartile"
# Joint count and joint probability P(A and B).
count_A_and_B <- sum(table_A & table_B)
prob_A_and_B <- count_A_and_B / total_count
# Conditional probability P(A|B).
prob_A_given_B <- prob_A_and_B / prob_B
# Check the relation asked for in the question, P(A|B) = P(A) * P(B).
# These are floating-point ratios, so compare with a numeric tolerance
# (all.equal) rather than `==`, which can report FALSE because of rounding
# error alone.
# Note: the usual independence criterion is P(A|B) = P(A), equivalently
# P(A and B) = P(A) * P(B).
is_independent <- isTRUE(all.equal(prob_A_given_B, prob_A * prob_B))
# Chi-square test for association on the 2 x 2 table of counts.
chisq_test <- chisq.test(table_counts)
# Display results
cat("Probability of observations above the third quartile for X (A):", prob_A, "\n")## Probability of observations above the third quartile for X (A): 0.2417808
## Probability of observations above the second quartile for Y (B): 0.4986301
## Probability of observations in both A and B (A ∩ B): 0.1664384
## Conditional probability P(A|B): 0.3337912
## Is A independent of B (P(A|B) = P(A) * P(B)): FALSE
## Chi-square test for association:
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table_counts
## X-squared = 66.058, df = 1, p-value = 4.38e-16
The Chi-square test indicates a significant association between the variables A and B, rejecting the hypothesis of independence.
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Provide a 95% CI for the difference in the mean of the variables. Derive a correlation matrix for two of the quantitative variables you selected. Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. Discuss the meaning of your analysis.
## id mssubclass mszoning lotfrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 0.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 42.00
## Median : 730.5 Median : 50.0 Mode :character Median : 63.00
## Mean : 730.5 Mean : 56.9 Mean : 57.62
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 79.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## lotarea street alley lotshape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
## landcontour utilities lotconfig landslope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## neighborhood condition1 condition2 bldgtype
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## housestyle overallqual overallcond yearbuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
## yearremodadd roofstyle roofmatl exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
## exterior2nd masvnrtype masvnrarea exterqual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.1
## 3rd Qu.: 164.2
## Max. :1600.0
## extercond foundation bsmtqual bsmtcond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## bsmtexposure bsmtfintype1 bsmtfinsf1 bsmtfintype2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
## bsmtfinsf2 bsmtunfsf totalbsmtsf heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
## heatingqc centralair electrical 1stflrsf
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
## 2ndflrsf lowqualfinsf grlivarea bsmtfullbath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
## bsmthalfbath fullbath halfbath bedroomabvgr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
## kitchenabvgr kitchenqual totrmsabvgrd functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
## fireplaces fireplacequ garagetype garageyrblt
## Min. :0.000 Length:1460 Length:1460 Length:1460
## 1st Qu.:0.000 Class :character Class :character Class :character
## Median :1.000 Mode :character Mode :character Mode :character
## Mean :0.613
## 3rd Qu.:1.000
## Max. :3.000
## garagefinish garagecars garagearea garagequal
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
## garagecond paveddrive wooddecksf openporchsf
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
## enclosedporch 3ssnporch screenporch poolarea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
## poolqc fence miscfeature miscval
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
## mosold yrsold saletype salecondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
## saleprice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
# Histograms of four quantitative variables, drawn on a 2 x 2 grid.
par(mfrow = c(2, 2))
hist_labels <- c(
  lotfrontage = "Lot Frontage",
  saleprice = "Sale Price",
  lotarea = "Lot Area",
  overallqual = "Overall Quality"
)
for (hist_column in names(hist_labels)) {
  axis_label <- hist_labels[[hist_column]]
  hist(
    houses[[hist_column]],
    main = paste("Histogram of", axis_label),
    xlab = axis_label
  )
}
# Scatterplot of X (lotfrontage) against Y (saleprice).
plot(
  houses$lotfrontage, houses$saleprice,
  main = "Scatterplot of Lot Frontage vs. Sale Price",
  xlab = "Lot Frontage",
  ylab = "Sale Price"
)
# 95% confidence interval for the difference in the means of the two variables.
t.test(houses$lotfrontage, houses$saleprice, conf.level = 0.95)##
## Welch Two Sample t-test
##
## data: houses$lotfrontage and houses$saleprice
## t = -86.991, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -184941.9 -176785.2
## sample estimates:
## mean of x mean of y
## 57.62329 180921.19589
# 2 x 2 correlation matrix of lot frontage and sale price.
# The column names are written literally inside `[`: houses is a data.table,
# which evaluates a bare variable name in that position differently.
correlation_matrix <- cor(x = houses[, c("lotfrontage", "saleprice")])
# Test of H0: correlation = 0 (default 95% confidence interval).
correlation_test <- cor.test(houses$lotfrontage, houses$saleprice)
# Same test, reporting a 99% confidence interval for the correlation.
correlation_ci <- cor.test(
  houses$lotfrontage, houses$saleprice,
  conf.level = 0.99
)
# Show the results.
print("Correlation Matrix:")## [1] "Correlation Matrix:"
## lotfrontage saleprice
## lotfrontage 1.0000000 0.2096239
## saleprice 0.2096239 1.0000000
## [1] "Test for Correlation:"
##
## Pearson's product-moment correlation
##
## data: houses$lotfrontage and houses$saleprice
## t = 8.1861, df = 1458, p-value = 5.824e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1600428 0.2581501
## sample estimates:
## cor
## 0.2096239
## [1] "Confidence Interval for Correlation:"
##
## Pearson's product-moment correlation
##
## data: houses$lotfrontage and houses$saleprice
## t = 8.1861, df = 1458, p-value = 5.824e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.1442822 0.2731456
## sample estimates:
## cor
## 0.2096239
95% CI for Difference in Means: Difference in means between Lot Frontage and Sale Price estimated between -$184,941.9 and -$176,785.2. Correlation: Weak positive correlation (r = 0.21) between Lot Frontage and Sale Price, statistically significant (p < 0.05).
Invert your correlation matrix. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct principal components analysis (research this!) and interpret. Discuss.
# Precision matrix: the inverse of the correlation matrix.
precision_matrix <- solve(a = correlation_matrix)
# Products in both orders; each is expected to be the identity matrix
# up to rounding.
correlation_precision <- correlation_matrix %*% precision_matrix
precision_correlation <- precision_matrix %*% correlation_matrix
# Principal component analysis on the two variables, each scaled to unit variance.
pca_result <- prcomp(
  houses[, c("lotfrontage", "saleprice")],
  scale. = TRUE
)
# Standard deviation and share of variance for each component.
summary(pca_result)## Importance of components:
## PC1 PC2
## Standard deviation 1.0998 0.8890
## Proportion of Variance 0.6048 0.3952
## Cumulative Proportion 0.6048 1.0000
The PCA results show that the first principal component (PC1) explains 60.48% of the total variance, while the second principal component (PC2) explains 39.52%. Together, these two components capture all the variance in the dataset, with PC1 being more influential in explaining the variability compared to PC2.
Many times, it makes sense to fit a closed form distribution to data. For your variable that is skewed to the right, shift it so that the minimum value is above zero. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
# Shift the data so that the minimum value is above zero (the minimum becomes 1).
# NOTE(review): this uses lotarea, while X in the earlier sections is
# lotfrontage -- confirm which variable is intended here.
shifted_data <- houses$lotarea - min(houses$lotarea) + 1
# Load the MASS package, which provides fitdistr().
library(MASS)
# Fit an exponential distribution to the shifted data.
fit_exponential <- fitdistr(shifted_data, "exponential")
# Estimated rate parameter (lambda) of the fitted distribution.
optimal_lambda <- fit_exponential$estimate
# Fix the random seed so that the simulated sample, its summary and its
# histogram are the same on every run of the report (the seed value itself is
# arbitrary).
set.seed(605)
# Draw 1000 samples from the fitted exponential distribution.
samples <- rexp(1000, rate = optimal_lambda)
# Summary of the fitted exponential distribution
print("Summary of the fitted exponential distribution:")## [1] "Summary of the fitted exponential distribution:"
## rate
## 1.084854e-04
## (2.839193e-06)
## Optimal value of lambda: 0.0001084854
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.47 2547.50 6313.56 9092.36 13137.06 55775.28
# Histogram of the simulated exponential sample.
hist(
  samples,
  main = "Histogram of Generated Samples from Exponential Distribution",
  xlab = "Sample Value",
  ylab = "Frequency"
)
# 5th and 95th percentiles of the fitted exponential distribution.
percentile_5 <- qexp(p = 0.05, rate = optimal_lambda)
percentile_95 <- qexp(p = 0.95, rate = optimal_lambda)
# 95% confidence interval for the mean of the shifted data, using the
# t distribution (normality assumption).
n <- length(shifted_data)
mean_empirical <- mean(shifted_data)
sd_empirical <- sd(shifted_data)
margin_of_error <- qt(p = 0.975, df = n - 1) * sd_empirical / sqrt(n)
confidence_interval <- mean_empirical + c(-1, 1) * margin_of_error
# Empirical 5th and 95th percentiles of the shifted data.
empirical_percentile_5 <- quantile(shifted_data, probs = 0.05)
empirical_percentile_95 <- quantile(shifted_data, probs = 0.95)
# Print the results.
cat("5th Percentile using CDF:", percentile_5, "\n")## 5th Percentile using CDF: 472.8128
## 95th Percentile using CDF: 27614.15
cat("95% Confidence Interval from Empirical Data (Normality Assumption):", confidence_interval, "\n")## 95% Confidence Interval from Empirical Data (Normality Assumption): 8705.418 9730.238
## Empirical 5th Percentile: 2012.7
## Empirical 95th Percentile: 16102.15
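These values summarize the shifted lot-area data and show how well the exponential model describes it. The fitted exponential distribution (λ ≈ 0.000108) places its 5th and 95th percentiles at about 473 and 27,614, whereas the empirical 5th and 95th percentiles are about 2,013 and 16,102. The exponential model therefore puts too much probability both near zero and far out in the right tail, so it is a poor description of this variable. The normal-theory 95% confidence interval (8,705 to 9,730) is an interval for the mean of the shifted data, not a range that contains 95% of the observations, which is why it is far narrower than the span between the empirical percentiles.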
Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
Simple linear regression
# Simple linear regression model of SalePrice by GrLivArea, for later comparison to our multi-linear regression model
# NOTE(review): the column names of `houses` were lowercased when the data was loaded, so it has
# no columns named `X` or `Y`; lm() must be resolving them from objects defined outside `houses`.
# Confirm that those objects really hold GrLivArea and SalePrice as the comment above states.
train.lm <- lm(Y ~ X, data=houses)
# Get summary of our model (residual quantiles, coefficient table, R-squared, F-statistic)
summary(train.lm)##
## Call:
## lm(formula = Y ~ X, data = houses)
##
## Residuals:
## Min 1Q Median 3Q Max
## -174241 -52353 -15113 34262 551799
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 153238.44 3946.05 38.833 < 2e-16 ***
## X 480.41 58.69 8.186 5.82e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 77700 on 1458 degrees of freedom
## Multiple R-squared: 0.04394, Adjusted R-squared: 0.04329
## F-statistic: 67.01 on 1 and 1458 DF, p-value: 5.824e-16
# Residuals of the simple model plotted against its fitted values
plot(
  x = train.lm$fitted.values,
  y = train.lm$residuals,
  xlab = "Fitted Values",
  ylab = "Residuals"
)
abline(h=0, col="skyblue")
The fitted model has an intercept of $153,238.44, and each one-unit increase in X is associated with an additional $480.41 in the predicted value. The relationship is statistically significant (p < 0.001), but the model explains only 4.39% of the variance in the data, so X on its own is a weak predictor.
Multiple linear Regression
# Lay the plots out on a 2 x 2 grid
par(mfrow = c(2, 2))
# Candidate predictors: every column except the first and the last one
candidate_cols <- names(houses)[2:(ncol(houses) - 1)]
for (col_name in candidate_cols) {
  predictor <- houses[[col_name]]
  # Only numeric columns get a scatterplot
  if (!is.numeric(predictor)) {
    next
  }
  plot(
    houses$saleprice ~ predictor,
    main = col_name, xlab = col_name, ylab = "SalePrice", col = "black"
  )
  # Overlay the least-squares line of sale price on this single predictor
  reg_line <- lm(houses$saleprice ~ predictor)
  abline(reg_line, col = "red")
}
# Multiple regression model- numeric fields only (35 fields)
# Full multiple regression model of sale price on the candidate predictors.
# NOTE(review): `X1stflrsf` / `X2ndflrsf` carry the `X` prefix that read.csv()
# puts in front of names starting with a digit; confirm `houses` really has
# columns with these exact names.
# NOTE(review): KitchenQual is a character column in the test-set summary, so
# lm() will presumably dummy-code `kitchenqual` -- confirm this is intended.
train.lm2 <- lm(
  saleprice ~ lotarea + overallqual + overallcond + yearbuilt + yearremodadd +
    masvnrarea + bsmtfinsf1 + bsmtfinsf2 + bsmtunfsf + totalbsmtsf +
    X1stflrsf + X2ndflrsf + lowqualfinsf + grlivarea + bsmtfullbath +
    bsmthalfbath + fullbath + halfbath + bedroomabvgr + kitchenabvgr +
    kitchenqual + totrmsabvgrd + fireplaces + garageyrblt + garagecars +
    garagearea + wooddecksf + openporchsf + enclosedporch + screenporch +
    poolarea + miscval + mosold + yrsold,
  data = houses
)
# Refine multiple-regression model with backwards elimination process for each field with high p-values (16 fields)
# Term names in a formula are case-sensitive and the model above uses the
# lowercase column names, so the terms must be removed under those lowercase
# names; subtracting e.g. `BsmtHalfBath` matches nothing and silently leaves
# the model unchanged.
# `lotfrontage` is kept from the original elimination list although it is not
# part of the formula above (subtracting an absent term has no effect).
# `grlivarea` was the author's independent variable, dropped for its high
# p-value (0.654496).
train.lm2 <- update(
  train.lm2,
  . ~ . - bsmthalfbath - mosold - miscval - garagearea - openporchsf -
    lowqualfinsf - lotfrontage - enclosedporch - halfbath - yearremodadd -
    grlivarea - garageyrblt - bsmtfinsf2 - bsmtunfsf - yrsold - poolarea
)
# Diagnostic 1: residuals against fitted values, with a zero reference line
plot(
  train.lm2$fitted.values, train.lm2$residuals,
  xlab = "Fitted Values", ylab = "Residuals",
  main = "Residuals vs Fitted"
)
abline(h = 0, col = "red")
# Diagnostic 2: histogram of the residuals
hist(train.lm2$residuals, main = "Multiple Linear Residuals Histogram")
# Diagnostic 3: normal Q-Q plot of the residuals with its reference line
qqnorm(train.lm2$residuals)
qqline(train.lm2$residuals)
# Plot
par(mfrow=c(2,2))
p-value: The probability of observing an F-statistic at least as extreme as the one computed, under the null hypothesis that there is no relationship between the predictors and the response. A small p-value is strong evidence against that null hypothesis.
Now, we must predict SalePrice in test_data using our model
# Read test.csv
# NOTE(review): unlike the training URL, which ends in "train.csv", this path ends in
# "test_data" rather than "test.csv" -- confirm the file really exists under this name.
test_data <- read.csv("https://raw.githubusercontent.com/waheeb123/Data-605/main/Final%20Project/house-prices-advanced-regression-techniques/test_data")
# Summary (per-column summary statistics of the test set; numeric columns also report their NA count)
summary(test_data)## Id MSSubClass MSZoning LotFrontage
## Min. :1461 Min. : 20.00 Length:1459 Min. : 21.00
## 1st Qu.:1826 1st Qu.: 20.00 Class :character 1st Qu.: 58.00
## Median :2190 Median : 50.00 Mode :character Median : 67.00
## Mean :2190 Mean : 57.38 Mean : 68.58
## 3rd Qu.:2554 3rd Qu.: 70.00 3rd Qu.: 80.00
## Max. :2919 Max. :190.00 Max. :200.00
## NA's :227
## LotArea Street Alley LotShape
## Min. : 1470 Length:1459 Length:1459 Length:1459
## 1st Qu.: 7391 Class :character Class :character Class :character
## Median : 9399 Mode :character Mode :character Mode :character
## Mean : 9819
## 3rd Qu.:11518
## Max. :56600
##
## LandContour Utilities LotConfig LandSlope
## Length:1459 Length:1459 Length:1459 Length:1459
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1459 Length:1459 Length:1459 Length:1459
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1459 Min. : 1.000 Min. :1.000 Min. :1879
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1953
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.079 Mean :5.554 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2001
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1459 Length:1459 Length:1459
## 1st Qu.:1963 Class :character Class :character Class :character
## Median :1992 Mode :character Mode :character Mode :character
## Mean :1984
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1459 Length:1459 Min. : 0.0 Length:1459
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 100.7
## 3rd Qu.: 164.0
## Max. :1290.0
## NA's :15
## ExterCond Foundation BsmtQual BsmtCond
## Length:1459 Length:1459 Length:1459 Length:1459
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1459 Length:1459 Min. : 0.0 Length:1459
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 350.5 Mode :character
## Mean : 439.2
## 3rd Qu.: 753.5
## Max. :4010.0
## NA's :1
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0 Length:1459
## 1st Qu.: 0.00 1st Qu.: 219.2 1st Qu.: 784 Class :character
## Median : 0.00 Median : 460.0 Median : 988 Mode :character
## Mean : 52.62 Mean : 554.3 Mean :1046
## 3rd Qu.: 0.00 3rd Qu.: 797.8 3rd Qu.:1305
## Max. :1526.00 Max. :2140.0 Max. :5095
## NA's :1 NA's :1 NA's :1
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1459 Length:1459 Length:1459 Min. : 407.0
## Class :character Class :character Class :character 1st Qu.: 873.5
## Mode :character Mode :character Mode :character Median :1079.0
## Mean :1156.5
## 3rd Qu.:1382.5
## Max. :5095.0
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 407 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1118 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1432 Median :0.0000
## Mean : 326 Mean : 3.543 Mean :1486 Mean :0.4345
## 3rd Qu.: 676 3rd Qu.: 0.000 3rd Qu.:1721 3rd Qu.:1.0000
## Max. :1862 Max. :1064.000 Max. :5095 Max. :3.0000
## NA's :2
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.0000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.0652 Mean :1.571 Mean :0.3777 Mean :2.854
## 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.0000 Max. :4.000 Max. :2.0000 Max. :6.000
## NA's :2
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1459 Min. : 3.000 Length:1459
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.042 Mean : 6.385
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.000 Max. :15.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.0000 Length:1459 Length:1459 Min. :1895
## 1st Qu.:0.0000 Class :character Class :character 1st Qu.:1959
## Median :0.0000 Mode :character Mode :character Median :1979
## Mean :0.5812 Mean :1978
## 3rd Qu.:1.0000 3rd Qu.:2002
## Max. :4.0000 Max. :2207
## NA's :78
## GarageFinish GarageCars GarageArea GarageQual
## Length:1459 Min. :0.000 Min. : 0.0 Length:1459
## Class :character 1st Qu.:1.000 1st Qu.: 318.0 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.766 Mean : 472.8
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :5.000 Max. :1488.0
## NA's :1 NA's :1
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1459 Length:1459 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 28.00
## Mean : 93.17 Mean : 48.31
## 3rd Qu.: 168.00 3rd Qu.: 72.00
## Max. :1424.00 Max. :742.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.000 Median : 0.00 Median : 0.000
## Mean : 24.24 Mean : 1.794 Mean : 17.06 Mean : 1.744
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :1012.00 Max. :360.000 Max. :576.00 Max. :800.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1459 Length:1459 Length:1459 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 58.17
## 3rd Qu.: 0.00
## Max. :17000.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1459 Length:1459
## 1st Qu.: 4.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.104 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## [1] TRUE
## [1] 7000
# Count the NA entries in every column of the test set
Missing_values2 <- vapply(test_data, function(col) sum(is.na(col)), numeric(1))
# Show only the columns that contain at least one NA
Missing_values2[Missing_values2 > 0]
## MSZoning LotFrontage Alley Utilities Exterior1st Exterior2nd
## 4 227 1352 2 1 1
## MasVnrType MasVnrArea BsmtQual BsmtCond BsmtExposure BsmtFinType1
## 16 15 44 45 44 42
## BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath
## 1 42 1 1 1 2
## BsmtHalfBath KitchenQual Functional FireplaceQu GarageType GarageYrBlt
## 2 1 2 730 76 78
## GarageFinish GarageCars GarageArea GarageQual GarageCond PoolQC
## 78 1 1 78 78 1456
## Fence MiscFeature SaleType
## 1169 1408 1
# Let's replace the missing numeric records with the average value of each column.
# Only numeric columns are processed here: calling mean() on a character
# column returns NA with a warning, and those columns get their own
# most-frequent-value imputation afterwards.
numeric_cols <- vapply(test_data, is.numeric, logical(1))
test_data[numeric_cols] <- lapply(test_data[numeric_cols], function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
})
# Now replace the missing values in the categorical variables (https://stackoverflow.com/questions/36377813/impute-most-frequent-categorical-value-in-all-columns-in-data-frame)
# i2 flags the non-numeric columns
i2 <- !numeric_cols
# Most common value of a vector.
# Distinct values are sorted first, so when several values are tied for the
# highest count the smallest one is returned. NA entries are never counted.
Mode <- function(x) {
  distinct_vals <- sort(unique(x))
  # Position of each observation among the distinct values (NA if unmatched)
  positions <- match(x, distinct_vals)
  # Number of observations falling on each distinct value
  tallies <- tabulate(positions)
  distinct_vals[which.max(tallies)]
}
# Fill the NAs of every non-numeric column with that column's most frequent
# non-missing value
test_data[i2] <- lapply(test_data[i2], function(col) {
  missing <- is.na(col)
  col[missing] <- Mode(col[!missing])
  col
})
# Check to see if any values are missing/NA
sum(is.na(test_data)) ## [1] 0
# Predict Sale Price for test_data from our multiple linear regression model.
# The model was fitted on lowercase column names, whereas test_data keeps the
# mixed-case names produced by read.csv(). predict() looks predictors up by
# exact name, so the test columns are matched case-insensitively and passed
# under the names the model expects.
model_vars <- all.vars(delete.response(terms(train.lm2)))
matched_cols <- match(tolower(model_vars), tolower(names(test_data)))
if (anyNA(matched_cols)) {
  stop(
    "test_data is missing model predictors: ",
    paste(model_vars[is.na(matched_cols)], collapse = ", "),
    call. = FALSE
  )
}
prediction_data <- setNames(test_data[matched_cols], model_vars)
test_data$SalePrice <- predict(train.lm2, newdata = prediction_data)
# Create new dataframe of ID's and predicted SalePrice for Kaggle Submission
Kaggle <- test_data[, c("Id", "SalePrice")]
# Export new dataframe to csv
write.csv(Kaggle, "605final.csv", row.names = FALSE)