Introduction

In this final exam, we delve into Computational Mathematics by participating in the House Prices: Advanced Regression Techniques competition hosted on Kaggle.com. Through this exam, we aim to showcase our understanding of various mathematical concepts, statistical methods, and their application in real-world datasets. The exam spans across different branches of mathematics including probability theory, descriptive and inferential statistics, linear algebra, calculus-based probability and statistics, and modeling through regression analysis.

Loading required libraries
## corrplot 0.92 loaded
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
Data
# Read the training data (train.csv) for the House Prices competition from the
# course repository.
houses <- fread("https://raw.githubusercontent.com/waheeb123/Data-605/main/Final%20Project/house-prices-advanced-regression-techniques/train.csv")

# Normalise the column names: lower case, with any space turned into "_".
names(houses) <- gsub(" ", "_", tolower(names(houses)), fixed = TRUE)

# Show the type and the first few values of every column.
str(houses)
## Classes 'data.table' and 'data.frame':   1460 obs. of  81 variables:
##  $ id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ mssubclass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ mszoning     : chr  "RL" "RL" "RL" "RL" ...
##  $ lotfrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ lotarea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ alley        : chr  NA NA NA NA ...
##  $ lotshape     : chr  "Reg" "Reg" "IR1" "IR1" ...
##  $ landcontour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ lotconfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ landslope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ neighborhood : chr  "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ condition1   : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ bldgtype     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ housestyle   : chr  "2Story" "1Story" "2Story" "2Story" ...
##  $ overallqual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ overallcond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ yearbuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ yearremodadd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ roofstyle    : chr  "Gable" "Gable" "Gable" "Gable" ...
##  $ roofmatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ exterior1st  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ exterior2nd  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ masvnrtype   : chr  "BrkFace" "None" "BrkFace" "None" ...
##  $ masvnrarea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ exterqual    : chr  "Gd" "TA" "Gd" "TA" ...
##  $ extercond    : chr  "TA" "TA" "TA" "TA" ...
##  $ foundation   : chr  "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ bsmtqual     : chr  "Gd" "Gd" "Gd" "TA" ...
##  $ bsmtcond     : chr  "TA" "TA" "TA" "Gd" ...
##  $ bsmtexposure : chr  "No" "Gd" "Mn" "No" ...
##  $ bsmtfintype1 : chr  "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ bsmtfinsf1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ bsmtfintype2 : chr  "Unf" "Unf" "Unf" "Unf" ...
##  $ bsmtfinsf2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ bsmtunfsf    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ totalbsmtsf  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ heatingqc    : chr  "Ex" "Ex" "Ex" "Gd" ...
##  $ centralair   : chr  "Y" "Y" "Y" "Y" ...
##  $ electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ 1stflrsf     : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ 2ndflrsf     : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ lowqualfinsf : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ grlivarea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ bsmtfullbath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ bsmthalfbath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ fullbath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ halfbath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ bedroomabvgr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ kitchenabvgr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ kitchenqual  : chr  "Gd" "TA" "Gd" "Gd" ...
##  $ totrmsabvgrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ fireplacequ  : chr  NA "TA" "TA" "Gd" ...
##  $ garagetype   : chr  "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ garageyrblt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ garagefinish : chr  "RFn" "RFn" "RFn" "Unf" ...
##  $ garagecars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ garagearea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ garagequal   : chr  "TA" "TA" "TA" "TA" ...
##  $ garagecond   : chr  "TA" "TA" "TA" "TA" ...
##  $ paveddrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ wooddecksf   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ openporchsf  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ enclosedporch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ 3ssnporch    : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ screenporch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poolarea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poolqc       : chr  NA NA NA NA ...
##  $ fence        : chr  NA NA NA NA ...
##  $ miscfeature  : chr  NA NA NA NA ...
##  $ miscval      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ mosold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ yrsold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ saletype     : chr  "WD" "WD" "WD" "WD" ...
##  $ salecondition: chr  "Normal" "Normal" "Normal" "Abnorml" ...
##  $ saleprice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
##  - attr(*, ".internal.selfref")=<externalptr>
dim(houses)
## [1] 1460   81
kable(head(houses))
id mssubclass mszoning lotfrontage lotarea street alley lotshape landcontour utilities lotconfig landslope neighborhood condition1 condition2 bldgtype housestyle overallqual overallcond yearbuilt yearremodadd roofstyle roofmatl exterior1st exterior2nd masvnrtype masvnrarea exterqual extercond foundation bsmtqual bsmtcond bsmtexposure bsmtfintype1 bsmtfinsf1 bsmtfintype2 bsmtfinsf2 bsmtunfsf totalbsmtsf heating heatingqc centralair electrical 1stflrsf 2ndflrsf lowqualfinsf grlivarea bsmtfullbath bsmthalfbath fullbath halfbath bedroomabvgr kitchenabvgr kitchenqual totrmsabvgrd functional fireplaces fireplacequ garagetype garageyrblt garagefinish garagecars garagearea garagequal garagecond paveddrive wooddecksf openporchsf enclosedporch 3ssnporch screenporch poolarea poolqc fence miscfeature miscval mosold yrsold saletype salecondition saleprice
1 60 RL 65 8450 Pave NA Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196 Gd TA PConc Gd TA No GLQ 706 Unf 0 150 856 GasA Ex Y SBrkr 856 854 0 1710 1 0 2 1 3 1 Gd 8 Typ 0 NA Attchd 2003 RFn 2 548 TA TA Y 0 61 0 0 0 0 NA NA NA 0 2 2008 WD Normal 208500
2 20 RL 80 9600 Pave NA Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0 TA TA CBlock Gd TA Gd ALQ 978 Unf 0 284 1262 GasA Ex Y SBrkr 1262 0 0 1262 0 1 2 0 3 1 TA 6 Typ 1 TA Attchd 1976 RFn 2 460 TA TA Y 298 0 0 0 0 0 NA NA NA 0 5 2007 WD Normal 181500
3 60 RL 68 11250 Pave NA IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162 Gd TA PConc Gd TA Mn GLQ 486 Unf 0 434 920 GasA Ex Y SBrkr 920 866 0 1786 1 0 2 1 3 1 Gd 6 Typ 1 TA Attchd 2001 RFn 2 608 TA TA Y 0 42 0 0 0 0 NA NA NA 0 9 2008 WD Normal 223500
4 70 RL 60 9550 Pave NA IR1 Lvl AllPub Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0 TA TA BrkTil TA Gd No ALQ 216 Unf 0 540 756 GasA Gd Y SBrkr 961 756 0 1717 1 0 1 0 3 1 Gd 7 Typ 1 Gd Detchd 1998 Unf 3 642 TA TA Y 0 35 272 0 0 0 NA NA NA 0 2 2006 WD Abnorml 140000
5 60 RL 84 14260 Pave NA IR1 Lvl AllPub FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350 Gd TA PConc Gd TA Av GLQ 655 Unf 0 490 1145 GasA Ex Y SBrkr 1145 1053 0 2198 1 0 2 1 4 1 Gd 9 Typ 1 TA Attchd 2000 RFn 3 836 TA TA Y 192 84 0 0 0 0 NA NA NA 0 12 2008 WD Normal 250000
6 50 RL 85 14115 Pave NA IR1 Lvl AllPub Inside Gtl Mitchel Norm Norm 1Fam 1.5Fin 5 5 1993 1995 Gable CompShg VinylSd VinylSd None 0 TA TA Wood Gd TA No GLQ 732 Unf 0 64 796 GasA Ex Y SBrkr 796 566 0 1362 1 0 1 1 1 1 TA 5 Typ 0 NA Attchd 1993 Unf 2 480 TA TA Y 40 30 0 320 0 0 NA MnPrv Shed 700 10 2009 WD Normal 143000

Missing Values and NA’s

Missing values are handled at the beginning so that the analysis can run smoothly later on. We first create a data frame holding the number of NA’s in each column of houses. Related columns are then tabulated against each other so that the NA’s can be analyzed and removed or replaced with an appropriate value. In our dataset the garage columns (garage type, garage quality, garage condition) are interlinked, so related columns like these are checked together when handling the missing values.

# Number of missing values per column of houses, stored in a one-column data
# frame named "na" (the row names are the column names of houses).
df_na <- data.frame(na = colSums(is.na(houses)))


kable(table(houses$poolqc,houses$poolarea, useNA = 'ifany'))
0 480 512 519 555 576 648 738
Ex 0 0 1 0 1 0 0 0
Fa 0 0 0 1 0 0 1 0
Gd 0 1 0 0 0 1 0 1
NA 1453 0 0 0 0 0 0 0
## All 1,453 missing poolqc values fall in the poolarea == 0 column of the table
## above, so they are taken to mean "no pool" and recoded to "None".
houses$poolqc <- replace(houses$poolqc, is.na(houses$poolqc), "None")

## Using similar technique for fire places
table(houses$fireplacequ,houses$fireplaces,useNA = 'ifany')
##       
##          0   1   2   3
##   Ex     0  19   4   1
##   Fa     0  28   4   1
##   Gd     0 324  54   2
##   Po     0  20   0   0
##   TA     0 259  53   1
##   <NA> 690   0   0   0
## Every missing fireplacequ belongs to a house with 0 fireplaces (table above),
## so the missing quality is recoded to "None".
houses$fireplacequ <- replace(houses$fireplacequ, is.na(houses$fireplacequ), "None")




## Missing values in all garage-related columns are assumed to mean that the
## house has no garage at all.
# table(houses$garagetype, houses$garageyrblt, useNA = "ifany")

## Categorical garage columns: recode missing values to "None".
houses$garagetype[is.na(houses$garagetype)] <- "None"
houses$garagecond[is.na(houses$garagecond)] <- "None"
houses$garagefinish[is.na(houses$garagefinish)] <- "None"
houses$garagequal[is.na(houses$garagequal)] <- "None"

## garageyrblt is an integer year. Assigning "" to the missing entries would
## silently coerce the whole column to character and drop it from every numeric
## summary, so the gaps are filled with the year the house was built instead.
## Houses without a garage remain identifiable through garagetype == "None".
no_garage_year <- is.na(houses$garageyrblt)
houses$garageyrblt[no_garage_year] <- houses$yearbuilt[no_garage_year]



## another category
table(is.na(houses$masvnrarea),is.na(houses$masvnrtype))
##        
##         FALSE TRUE
##   FALSE  1452    0
##   TRUE      0    8
## The 8 rows with a missing masvnrtype are exactly the rows with a missing
## masvnrarea (cross-table above); they are recoded as "None" with area 0.
houses$masvnrtype <- replace(houses$masvnrtype, is.na(houses$masvnrtype), "None")
houses$masvnrarea <- replace(houses$masvnrarea, is.na(houses$masvnrarea), 0)


##for basement

table(houses$bsmtfintype1,houses$bsmtfintype2, useNA = 'ifany')
##       
##        ALQ BLQ GLQ LwQ Rec Unf <NA>
##   ALQ    0  15   2  15  22 166    0
##   BLQ    2   1   5  13  15 112    0
##   GLQ    4   2   0  10   9 392    1
##   LwQ    9   4   7   0   8  46    0
##   Rec    4  11   0   8   0 110    0
##   Unf    0   0   0   0   0 430    0
##   <NA>   0   0   0   0   0   0   37
## Missing basement finish types are recoded to "None".
## Per the table above, 37 rows miss both types; one further row has
## bsmtfintype1 == "GLQ" with a missing bsmtfintype2 and is recoded as well.
houses$bsmtfintype1 <- replace(houses$bsmtfintype1, is.na(houses$bsmtfintype1), "None")
houses$bsmtfintype2 <- replace(houses$bsmtfintype2, is.na(houses$bsmtfintype2), "None")

table(houses$bsmtqual,houses$bsmtcond, useNA = 'ifany')
##       
##         Fa  Gd  Po  TA <NA>
##   Ex     0  11   0 110    0
##   Fa     8   0   2  25    0
##   Gd     2  36   0 580    0
##   TA    35  18   0 596    0
##   <NA>   0   0   0   0   37
## bsmtqual and bsmtcond are missing together in 37 rows (table above); both
## are recoded to "None".
houses$bsmtqual <- replace(houses$bsmtqual, is.na(houses$bsmtqual), "None")
houses$bsmtcond <- replace(houses$bsmtcond, is.na(houses$bsmtcond), "None")
## bsmtexposure already contains the value "No". Missing values are recoded to
## "None" rather than "No", on the assumption that a basement with no exposure
## is different from a missing entry or from having no basement at all.
houses$bsmtexposure <- replace(houses$bsmtexposure, is.na(houses$bsmtexposure), "None")



### Miscellaneous columns.
## Character columns: a missing value is recoded to "None".
## NOTE(review): for electrical, "None" becomes an extra category -- confirm
## that this is intended rather than imputing an existing level.
houses$miscfeature <- replace(houses$miscfeature, is.na(houses$miscfeature), "None")
houses$fence <- replace(houses$fence, is.na(houses$fence), "None")
houses$alley <- replace(houses$alley, is.na(houses$alley), "None")
houses$electrical <- replace(houses$electrical, is.na(houses$electrical), "None")
## Numeric column: a missing lot frontage is set to 0.
## NOTE(review): 0 is treated as a real measurement afterwards, so it enters
## the quartiles, skewness and correlation computed on lotfrontage -- consider
## whether a median-based imputation would be more appropriate.
houses$lotfrontage <- replace(houses$lotfrontage, is.na(houses$lotfrontage), 0)




colSums(is.na(houses))
##            id    mssubclass      mszoning   lotfrontage       lotarea 
##             0             0             0             0             0 
##        street         alley      lotshape   landcontour     utilities 
##             0             0             0             0             0 
##     lotconfig     landslope  neighborhood    condition1    condition2 
##             0             0             0             0             0 
##      bldgtype    housestyle   overallqual   overallcond     yearbuilt 
##             0             0             0             0             0 
##  yearremodadd     roofstyle      roofmatl   exterior1st   exterior2nd 
##             0             0             0             0             0 
##    masvnrtype    masvnrarea     exterqual     extercond    foundation 
##             0             0             0             0             0 
##      bsmtqual      bsmtcond  bsmtexposure  bsmtfintype1    bsmtfinsf1 
##             0             0             0             0             0 
##  bsmtfintype2    bsmtfinsf2     bsmtunfsf   totalbsmtsf       heating 
##             0             0             0             0             0 
##     heatingqc    centralair    electrical      1stflrsf      2ndflrsf 
##             0             0             0             0             0 
##  lowqualfinsf     grlivarea  bsmtfullbath  bsmthalfbath      fullbath 
##             0             0             0             0             0 
##      halfbath  bedroomabvgr  kitchenabvgr   kitchenqual  totrmsabvgrd 
##             0             0             0             0             0 
##    functional    fireplaces   fireplacequ    garagetype   garageyrblt 
##             0             0             0             0             0 
##  garagefinish    garagecars    garagearea    garagequal    garagecond 
##             0             0             0             0             0 
##    paveddrive    wooddecksf   openporchsf enclosedporch     3ssnporch 
##             0             0             0             0             0 
##   screenporch      poolarea        poolqc         fence   miscfeature 
##             0             0             0             0             0 
##       miscval        mosold        yrsold      saletype salecondition 
##             0             0             0             0             0 
##     saleprice 
##             0
## no missing values, we can move forward with the analysis.

Another required check is to find the unique levels of the categorical variables. Categories can be entered inconsistently, for example in lower case in one row and with the first letter capitalized in another. E.g. the column misc feature has the categories

“None” “Shed” “Gar2” “Othr” “TenC”. We can check the variables to confirm that the data is correct: “Othr” and “othr”, for instance, would correspond to the same category. A spot check was performed on a few categorical variables; the same check can be performed on any other variables required for the analysis.

## performing a random check at cat. variables.

unique(houses$extercond)
## [1] "TA" "Gd" "Fa" "Po" "Ex"
# NOTE(review): this repeats the extercond check made just above; possibly a
# different column was meant to be inspected here -- confirm.
unique(houses$extercond)
## [1] "TA" "Gd" "Fa" "Po" "Ex"
unique(houses$miscval)
##  [1]     0   700   350   500   400   480   450 15500  1200   800  2000   600
## [13]  3500  1300    54   620   560  1400  8300  1150  2500
unique(houses$street)
## [1] "Pave" "Grvl"
Selecting Variables for Probability

Pick one of the quantitative independent variables from the training data set (train.csv), and define that variable as X. Make sure this variable is skewed to the right! Pick the dependent variable and define it as Y.

The variable selected as the predictor is lot frontage. Variable X is the lot frontage and variable Y is the sale price.

Plotting the variables.

# Histograms of the two selected variables: lot frontage (X) and sale price (Y).
# The plots are drawn for inspection only. The histogram objects are not stored
# under the names X and Y, because those names denote the data vectors
# themselves in the probability calculations.
hist(houses$lotfrontage, main = "Histogram of Lot Frontage", xlab = "Lot Frontage")

hist(houses$saleprice, main = "Histogram of Sale Price", xlab = "Sale Price")

Probability

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3d quartile of the X variable, and the small letter “y” is estimated as the 2d quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts.

# X is the predictor (lot frontage) and Y is the response (sale price).
X <- houses$lotfrontage
Y <- houses$saleprice

# Cut-off values for the probability questions:
# x is the third quartile of X, y is the second quartile (median) of Y.
x <- quantile(X, probs = 0.75, na.rm = TRUE)
y <- quantile(Y, probs = 0.50, na.rm = TRUE)

# Sample skewness of X; a positive value indicates a longer right tail.
skewness(X)
## [1] 0.2675471
# Counts shared by the probabilities below.
n_total <- length(X) # X and Y are columns of the same data set
n_y_above <- sum(Y > y) # sale price above its second quartile
n_both <- sum(X > x & Y > y) # both variables above their cut-offs

# a. P(X>x | Y>y)
prob_a <- n_both / n_y_above

# b. P(X>x, Y>y)
prob_b <- n_both / n_total

# c. P(X<x | Y>y) -- strict "<": rows with X exactly equal to x are not counted
prob_c <- sum(X < x & Y > y) / n_y_above

# Two-way table of counts. Each variable is split at its cut-off; the lower
# group includes values equal to the cut-off, as its "<=" label says.
table_X <- cut(
  X,
  breaks = c(-Inf, x, Inf),
  labels = c("<=3d quartile", ">3d quartile")
)
table_Y <- cut(
  Y,
  breaks = c(-Inf, y, Inf),
  labels = c("<=2d quartile", ">2d quartile")
)
table_counts <- table(table_X, table_Y)

# Same table with row and column totals appended.
table_counts_with_totals <- addmargins(table_counts)

# Display results
print("Probability a:")
## [1] "Probability a:"
print(kable(prob_a))
## 
## 
## |         x|
## |---------:|
## | 0.3337912|
print("Probability b:")
## [1] "Probability b:"
print(kable(prob_b))
## 
## 
## |         x|
## |---------:|
## | 0.1664384|
print("Probability c:")
## [1] "Probability c:"
print(kable(prob_c))
## 
## 
## |         x|
## |---------:|
## | 0.6510989|
print("Table of counts:")
## [1] "Table of counts:"
print(kable(table_counts_with_totals))
## 
## 
## |              | <=2d quartile| >2d quartile|  Sum|
## |:-------------|-------------:|------------:|----:|
## |<=3d quartile |           622|          485| 1107|
## |>3d quartile  |           110|          243|  353|
## |Sum           |           732|          728| 1460|

a. P(X>x | Y>y)

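The probability that lot frontage (X) exceeds its third quartile (x) given that sale price (Y) exceeds its second quartile, i.e. the median (y). It is computed by dividing the count of observations where X is above x and Y is above y (243) by the count of observations where Y is above y (728), which gives 0.3338.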


b. P(X>x, Y>y)

The probability indicates the likelihood of both lot frontage (X) and sale price (Y) exceeding their respective quartile values (x) and (y). It’s computed by dividing the count of observations where both X and Y exceed their quartile values (243) by the total count of observations (1460).

c. P(X<x | Y>y)

The probability signifies the chance that the lot frontage (X) is below the third quartile value (x) given that the sale price (Y) exceeds the second quartile value (y). It’s computed by dividing the count of observations where X is strictly below x and Y is above y (474) by the total count of observations where Y is above y (728), which gives 0.6511. Note that the table of counts groups X lower than or equal to x, so its cell of 485 also contains the 11 observations where X is exactly equal to x.

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 3d quartile for X, and let B be the new variable counting those observations above the 2d quartile for Y. Does P(A|B)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

# Total number of observations
total_count <- nrow(houses)

# A: observations above the third quartile of X
# B: observations above the second quartile of Y
count_A <- sum(X > x)
count_B <- sum(Y > y)

# Marginal probabilities P(A) and P(B)
prob_A <- count_A / total_count
prob_B <- count_B / total_count

# Logical membership of each observation in A and in B, taken from the
# grouping factors used for the table of counts
table_A <- table_X == ">3d quartile"
table_B <- table_Y == ">2d quartile"

# Joint count and joint probability P(A and B)
count_A_and_B <- sum(table_A & table_B)
prob_A_and_B <- count_A_and_B / total_count

# Conditional probability P(A|B) = P(A and B) / P(B)
prob_A_given_B <- prob_A_and_B / prob_B

# Check whether P(A|B) = P(A) * P(B), as the question asks.
# The probabilities are floating-point numbers, so they are compared with a
# numerical tolerance via all.equal() instead of `==`, which can report FALSE
# for values that differ only by rounding error.
# Note: A and B are independent exactly when P(A|B) = P(A), which is the same
# as P(A and B) = P(A) * P(B).
is_independent <- isTRUE(all.equal(prob_A_given_B, prob_A * prob_B))

# Chi-square test for association on the 2 x 2 table of counts
chisq_test <- chisq.test(table_counts)
# Display results
cat("Probability of observations above the third quartile for X (A):", prob_A, "\n")
## Probability of observations above the third quartile for X (A): 0.2417808
cat("Probability of observations above the second quartile for Y (B):", prob_B, "\n")
## Probability of observations above the second quartile for Y (B): 0.4986301
cat("Probability of observations in both A and B (A ∩ B):", prob_A_and_B, "\n")
## Probability of observations in both A and B (A ∩ B): 0.1664384
cat("Conditional probability P(A|B):", prob_A_given_B, "\n")
## Conditional probability P(A|B): 0.3337912
cat("Is A independent of B (P(A|B) = P(A) * P(B)):", is_independent, "\n\n")
## Is A independent of B (P(A|B) = P(A) * P(B)): FALSE
# Chi-square test result
cat("Chi-square test for association:\n")
## Chi-square test for association:
print(chisq_test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table_counts
## X-squared = 66.058, df = 1, p-value = 4.38e-16

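Mathematically, P(A|B) = 0.3338, while P(A) = 0.2418 and P(A)P(B) = 0.1206; likewise P(A ∩ B) = 0.1664 differs from P(A)P(B) = 0.1206, so splitting the data in this fashion does not make A and B independent. The Chi-square test agrees: it indicates a significant association between the variables A and B (p-value = 4.38e-16), rejecting the hypothesis of independence.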

Descriptive and Inferential Statistics.

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Provide a 95% CI for the difference in the mean of the variables. Derive a correlation matrix for two of the quantitative variables you selected. Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. Discuss the meaning of your analysis.

# Univariate Descriptive Statistics
summary(houses)
##        id           mssubclass      mszoning          lotfrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   :  0.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 42.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 63.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 57.62  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 79.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##     lotarea          street             alley             lotshape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##  landcontour         utilities          lotconfig          landslope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  neighborhood        condition1         condition2          bldgtype        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   housestyle         overallqual      overallcond      yearbuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##   yearremodadd   roofstyle           roofmatl         exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##  exterior2nd         masvnrtype          masvnrarea      exterqual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.1                     
##                                        3rd Qu.: 164.2                     
##                                        Max.   :1600.0                     
##   extercond          foundation          bsmtqual           bsmtcond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  bsmtexposure       bsmtfintype1         bsmtfinsf1     bsmtfintype2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##    bsmtfinsf2        bsmtunfsf       totalbsmtsf       heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##   heatingqc          centralair         electrical           1stflrsf   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##     2ndflrsf     lowqualfinsf       grlivarea     bsmtfullbath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##   bsmthalfbath        fullbath        halfbath       bedroomabvgr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##   kitchenabvgr   kitchenqual         totrmsabvgrd     functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##    fireplaces    fireplacequ         garagetype        garageyrblt       
##  Min.   :0.000   Length:1460        Length:1460        Length:1460       
##  1st Qu.:0.000   Class :character   Class :character   Class :character  
##  Median :1.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.613                                                           
##  3rd Qu.:1.000                                                           
##  Max.   :3.000                                                           
##  garagefinish         garagecars      garagearea      garagequal       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##   garagecond         paveddrive          wooddecksf      openporchsf    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##  enclosedporch      3ssnporch       screenporch        poolarea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##     poolqc             fence           miscfeature           miscval        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##      mosold           yrsold       saletype         salecondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##    saleprice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000
# Histograms of four selected quantitative variables, drawn on a 2 x 2 grid
par(mfrow = c(2, 2))
hist_labels <- list(
  lotfrontage = "Lot Frontage",
  saleprice   = "Sale Price",
  lotarea     = "Lot Area",
  overallqual = "Overall Quality"
)
for (hist_column in names(hist_labels)) {
  hist_label <- hist_labels[[hist_column]]
  hist(
    houses[[hist_column]],
    main = paste("Histogram of", hist_label),
    xlab = hist_label
  )
}

# Scatterplot of X (lotfrontage) against Y (saleprice)
plot(
  x = houses$lotfrontage,
  y = houses$saleprice,
  main = "Scatterplot of Lot Frontage vs. Sale Price",
  xlab = "Lot Frontage",
  ylab = "Sale Price"
)

# 95% confidence interval for the difference in means. The defaults are spelled
# out so it is explicit that this is a two-sided Welch (unequal variance) test.
t.test(
  x = houses$lotfrontage,
  y = houses$saleprice,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  houses$lotfrontage and houses$saleprice
## t = -86.991, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -184941.9 -176785.2
## sample estimates:
##    mean of x    mean of y 
##     57.62329 180921.19589
# Pearson correlation matrix of lot frontage and sale price
correlation_matrix <- cor(
  houses[, c("lotfrontage", "saleprice")],
  method = "pearson"
)

# Test H0: correlation = 0 (reported with the default 95% interval)
correlation_test <- cor.test(
  x = houses$lotfrontage,
  y = houses$saleprice,
  conf.level = 0.95
)

# Same test, reported with a 99% confidence interval for the correlation
correlation_ci <- cor.test(
  x = houses$lotfrontage,
  y = houses$saleprice,
  conf.level = 0.99
)

# Report the three results in turn
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(correlation_matrix)
##             lotfrontage saleprice
## lotfrontage   1.0000000 0.2096239
## saleprice     0.2096239 1.0000000
print("Test for Correlation:")
## [1] "Test for Correlation:"
print(correlation_test)
## 
##  Pearson's product-moment correlation
## 
## data:  houses$lotfrontage and houses$saleprice
## t = 8.1861, df = 1458, p-value = 5.824e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1600428 0.2581501
## sample estimates:
##       cor 
## 0.2096239
print("Confidence Interval for Correlation:")
## [1] "Confidence Interval for Correlation:"
print(correlation_ci)
## 
##  Pearson's product-moment correlation
## 
## data:  houses$lotfrontage and houses$saleprice
## t = 8.1861, df = 1458, p-value = 5.824e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.1442822 0.2731456
## sample estimates:
##       cor 
## 0.2096239

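95% CI for the difference in means: the Welch two-sample t-test estimates that the mean of Lot Frontage minus the mean of Sale Price lies between -184,941.9 and -176,785.2. Because the two variables are measured in different units and on very different scales, this interval mainly reflects that difference in scale. Correlation: there is a weak positive correlation (r = 0.21) between Lot Frontage and Sale Price. It is statistically significant (p = 5.8e-16, well below 0.05), and the 99% confidence interval for the correlation (0.144 to 0.273) excludes zero.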

Linear Algebra and Correlation.

Invert your correlation matrix. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct principal components analysis (research this!) and interpret. Discuss.

# Precision matrix: the inverse of the correlation matrix
precision_matrix <- solve(correlation_matrix)

# Products in both orders: correlation %*% precision, then precision %*% correlation
correlation_precision <- correlation_matrix %*% precision_matrix
precision_correlation <- precision_matrix %*% correlation_matrix

# Principal component analysis on the centred and scaled pair of variables
pca_result <- prcomp(
  houses[, c("lotfrontage", "saleprice")],
  center = TRUE,
  scale. = TRUE
)

# Standard deviation and share of variance of each component
summary(pca_result)
## Importance of components:
##                           PC1    PC2
## Standard deviation     1.0998 0.8890
## Proportion of Variance 0.6048 0.3952
## Cumulative Proportion  0.6048 1.0000

The PCA results show that the first principal component (PC1) explains 60.48% of the total variance, while the second principal component (PC2) explains 39.52%. Together, these two components capture all the variance in the dataset, with PC1 being more influential in explaining the variability compared to PC2.

Calculus-Based Probability & Statistics.

Many times, it makes sense to fit a closed form distribution to data. For your variable that is skewed to the right, shift it so that the minimum value is above zero. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

# Shift lot area so that its smallest value becomes 1 (strictly above zero)
shifted_data <- with(houses, lotarea - min(lotarea) + 1)

# fitdistr() comes from the MASS package
library(MASS)

# Maximum-likelihood fit of an exponential distribution to the shifted data
fit_exponential <- fitdistr(x = shifted_data, densfun = "exponential")

# The fitted rate is the optimal lambda
optimal_lambda <- fit_exponential$estimate

# Draw 1000 values from the fitted exponential distribution
samples <- rexp(n = 1000, rate = optimal_lambda)
# Report the fit: estimated rate with its standard error in parentheses
print("Summary of the fitted exponential distribution:")
## [1] "Summary of the fitted exponential distribution:"
print(fit_exponential)
##        rate    
##   1.084854e-04 
##  (2.839193e-06)
# Report the optimal lambda on its own
cat("Optimal value of lambda:", optimal_lambda, "\n")
## Optimal value of lambda: 0.0001084854
# Five-number summary and mean of the simulated values
summary(samples)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1.47  2547.50  6313.56  9092.36 13137.06 55775.28
# Histogram of the simulated values
hist(
  samples,
  main = "Histogram of Generated Samples from Exponential Distribution",
  xlab = "Sample Value",
  ylab = "Frequency"
)

# 5th and 95th percentiles of the fitted exponential distribution, obtained
# from its quantile function (the inverse of the CDF)
percentile_5 <- qexp(p = 0.05, rate = optimal_lambda)
percentile_95 <- qexp(p = 0.95, rate = optimal_lambda)

# 95% confidence interval for the mean of the empirical data, using the
# t distribution (normality assumption)
n <- length(shifted_data)
mean_empirical <- mean(shifted_data)
sd_empirical <- sd(shifted_data)
margin_of_error <- qt(0.975, df = n - 1) * sd_empirical / sqrt(n)
confidence_interval <- mean_empirical + c(-1, 1) * margin_of_error

# Empirical 5th and 95th percentiles of the shifted data
empirical_percentile_5 <- quantile(shifted_data, probs = 0.05)
empirical_percentile_95 <- quantile(shifted_data, probs = 0.95)

# Report all results
cat("5th Percentile using CDF:", percentile_5, "\n")
## 5th Percentile using CDF: 472.8128
cat("95th Percentile using CDF:", percentile_95, "\n")
## 95th Percentile using CDF: 27614.15
cat("95% Confidence Interval from Empirical Data (Normality Assumption):", confidence_interval, "\n")
## 95% Confidence Interval from Empirical Data (Normality Assumption): 8705.418 9730.238
cat("Empirical 5th Percentile:", empirical_percentile_5, "\n")
## Empirical 5th Percentile: 2012.7
cat("Empirical 95th Percentile:", empirical_percentile_95, "\n")
## Empirical 95th Percentile: 16102.15

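The fitted exponential distribution (λ ≈ 0.000108) places its 5th and 95th percentiles at about 472.8 and 27,614.2, whereas the empirical 5th and 95th percentiles of the shifted lot area are 2,012.7 and 16,102.2. The exponential model therefore puts too much probability both near zero and in the far right tail, so it is only a rough fit to the data. The 95% confidence interval for the mean under the normality assumption is 8,705.4 to 9,730.2. Together, these values give a compact overview of the variable's central tendency and variability.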

Modeling.

Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

Simple linear regression

# Simple linear regression model of Y on X, for later comparison to our multi-linear regression model.
# NOTE(review): X and Y are not defined in this section. The scatterplot comment earlier labels them
# X = lotfrontage and Y = saleprice, and the slope statistics printed below (t = 8.186, p = 5.82e-16)
# match the lotfrontage/saleprice correlation test, so this is a regression on lot frontage, not GrLivArea.
train.lm <- lm(Y ~ X, data=houses)

# Get summary of our model (coefficients, residual standard error, R-squared, F-statistic)
summary(train.lm)
## 
## Call:
## lm(formula = Y ~ X, data = houses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -174241  -52353  -15113   34262  551799 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 153238.44    3946.05  38.833  < 2e-16 ***
## X              480.41      58.69   8.186 5.82e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 77700 on 1458 degrees of freedom
## Multiple R-squared:  0.04394,    Adjusted R-squared:  0.04329 
## F-statistic: 67.01 on 1 and 1458 DF,  p-value: 5.824e-16
# Residuals against fitted values, with a horizontal reference line at zero
slr_fitted <- train.lm$fitted.values
slr_residuals <- train.lm$residuals
plot(x = slr_fitted, y = slr_residuals, xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, col = "skyblue")

# Distribution of the residuals (expression kept as-is: it is the default x label)
hist(train.lm$residuals, main = "Simple Linear Residuals Histogram")

# Normal Q-Q plot of the residuals with its reference line
qqnorm(slr_residuals)
qqline(slr_residuals)

# Standard lm diagnostic plots on a 2 x 2 grid
par(mfrow = c(2, 2))
plot(train.lm)

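The model estimates an intercept of $153,238.44, and each one-unit increase in X (lot frontage) is associated with an additional $480.41 in sale price. The relationship is highly significant (p < 0.001), but the model explains only 4.39% of the variance in sale price, so lot frontage alone is a weak predictor.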

Multiple linear Regression

# Four plots per page
par(mfrow = c(2, 2))

# Every column except the first (id) and the last (saleprice) is a candidate;
# only numeric ones are plotted against sale price with a fitted line in red.
candidate_names <- names(houses)[-c(1, ncol(houses))]
for (candidate in candidate_names) {
  predictor <- houses[[candidate]]
  if (!is.numeric(predictor)) {
    next
  }
  plot(
    houses$saleprice ~ predictor,
    main = candidate,
    xlab = candidate,
    ylab = "SalePrice",
    col = "black"
  )
  reg_line <- lm(houses$saleprice ~ predictor)
  abline(reg_line, col = "red")
}

# Multiple regression model on the candidate predictors of `houses`.
# Column names in `houses` were lower-cased after fread(), and fread() keeps
# names that start with a digit unchanged, so the floor-area columns must be
# written as `1stflrsf` / `2ndflrsf` (back-quoted), not X1stflrsf / X2ndflrsf.
# NOTE(review): kitchenqual looks categorical (it is character in the test
# set), so it enters the model as dummy variables - confirm this is intended.
train.lm2 <- lm(
  saleprice ~ lotarea + overallqual + overallcond + yearbuilt + yearremodadd +
    masvnrarea + bsmtfinsf1 + bsmtfinsf2 + bsmtunfsf + totalbsmtsf +
    `1stflrsf` + `2ndflrsf` + lowqualfinsf + grlivarea + bsmtfullbath +
    bsmthalfbath + fullbath + halfbath + bedroomabvgr + kitchenabvgr +
    kitchenqual + totrmsabvgrd + fireplaces + garageyrblt + garagecars +
    garagearea + wooddecksf + openporchsf + enclosedporch + screenporch +
    poolarea + miscval + mosold + yrsold,
  data = houses
)

# Refine the model by backwards elimination of the fields with high p-values.
# The term names must match the lower-case names used in the formula; removing
# a differently-cased name (e.g. BsmtHalfBath) silently leaves the model
# unchanged. All removals are applied in one refit, which yields the same final
# model as removing them one at a time. lotfrontage was never part of the
# formula, so it does not need to be removed.
# grlivarea: this is my independent variable, but it has a high p-value; 0.654496
train.lm2 <- update(
  train.lm2,
  . ~ . - bsmthalfbath - mosold - miscval - garagearea - openporchsf -
    lowqualfinsf - enclosedporch - halfbath - yearremodadd - grlivarea -
    garageyrblt - bsmtfinsf2 - bsmtunfsf - yrsold - poolarea
)

# Complete summary of the refined model (coefficients, R-squared, F-statistic)
summary(train.lm2)



# Scatterplot of fitted vs residual values, with a reference line at zero
plot(train.lm2$fitted.values, train.lm2$residuals, xlab = "Fitted Values", ylab = "Residuals", main = "Residuals vs Fitted")
abline(h = 0, col = "red")

# Residuals histogram
hist(train.lm2$residuals, main = "Multiple Linear Residuals Histogram")


# Normal Q-Q plot of the residuals with its reference line
qqnorm(train.lm2$residuals)
qqline(train.lm2$residuals)


# Standard lm diagnostic plots on a 2 x 2 grid. The layout was set but the
# plot call was missing; it mirrors the diagnostics drawn for train.lm.
par(mfrow = c(2, 2))
plot(train.lm2)

p-value: Probability of observing the F-statistic or more extreme values under the null hypothesis of no relationship between predictors and response. A small p-value indicates strong evidence against the null hypothesis.

Now, we must predict SalePrice in test_data using our model

# Load the Kaggle test set from the course repository
test_data <- read.csv(
  file = "https://raw.githubusercontent.com/waheeb123/Data-605/main/Final%20Project/house-prices-advanced-regression-techniques/test_data"
)

# Per-column summary of the test set
summary(object = test_data)
##        Id         MSSubClass       MSZoning          LotFrontage    
##  Min.   :1461   Min.   : 20.00   Length:1459        Min.   : 21.00  
##  1st Qu.:1826   1st Qu.: 20.00   Class :character   1st Qu.: 58.00  
##  Median :2190   Median : 50.00   Mode  :character   Median : 67.00  
##  Mean   :2190   Mean   : 57.38                      Mean   : 68.58  
##  3rd Qu.:2554   3rd Qu.: 70.00                      3rd Qu.: 80.00  
##  Max.   :2919   Max.   :190.00                      Max.   :200.00  
##                                                     NA's   :227     
##     LotArea         Street             Alley             LotShape        
##  Min.   : 1470   Length:1459        Length:1459        Length:1459       
##  1st Qu.: 7391   Class :character   Class :character   Class :character  
##  Median : 9399   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 9819                                                           
##  3rd Qu.:11518                                                           
##  Max.   :56600                                                           
##                                                                          
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1459        Length:1459        Length:1459        Length:1459       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1459        Length:1459        Length:1459        Length:1459       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1459        Min.   : 1.000   Min.   :1.000   Min.   :1879  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1953  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.079   Mean   :5.554   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2001  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1459        Length:1459        Length:1459       
##  1st Qu.:1963   Class :character   Class :character   Class :character  
##  Median :1992   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1984                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1459        Length:1459        Min.   :   0.0   Length:1459       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 100.7                     
##                                        3rd Qu.: 164.0                     
##                                        Max.   :1290.0                     
##                                        NA's   :15                         
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1459        Length:1459        Length:1459        Length:1459       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1459        Length:1459        Min.   :   0.0   Length:1459       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 350.5   Mode  :character  
##                                        Mean   : 439.2                     
##                                        3rd Qu.: 753.5                     
##                                        Max.   :4010.0                     
##                                        NA's   :1                          
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF     Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0   Length:1459       
##  1st Qu.:   0.00   1st Qu.: 219.2   1st Qu.: 784   Class :character  
##  Median :   0.00   Median : 460.0   Median : 988   Mode  :character  
##  Mean   :  52.62   Mean   : 554.3   Mean   :1046                     
##  3rd Qu.:   0.00   3rd Qu.: 797.8   3rd Qu.:1305                     
##  Max.   :1526.00   Max.   :2140.0   Max.   :5095                     
##  NA's   :1         NA's   :1        NA's   :1                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF     
##  Length:1459        Length:1459        Length:1459        Min.   : 407.0  
##  Class :character   Class :character   Class :character   1st Qu.: 873.5  
##  Mode  :character   Mode  :character   Mode  :character   Median :1079.0  
##                                                           Mean   :1156.5  
##                                                           3rd Qu.:1382.5  
##                                                           Max.   :5095.0  
##                                                                           
##    X2ndFlrSF     LowQualFinSF        GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :   0.000   Min.   : 407   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:   0.000   1st Qu.:1118   1st Qu.:0.0000  
##  Median :   0   Median :   0.000   Median :1432   Median :0.0000  
##  Mean   : 326   Mean   :   3.543   Mean   :1486   Mean   :0.4345  
##  3rd Qu.: 676   3rd Qu.:   0.000   3rd Qu.:1721   3rd Qu.:1.0000  
##  Max.   :1862   Max.   :1064.000   Max.   :5095   Max.   :3.0000  
##                                                   NA's   :2       
##   BsmtHalfBath       FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.0000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.0652   Mean   :1.571   Mean   :0.3777   Mean   :2.854  
##  3rd Qu.:0.0000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.0000   Max.   :4.000   Max.   :2.0000   Max.   :6.000  
##  NA's   :2                                                        
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1459        Min.   : 3.000   Length:1459       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.042                      Mean   : 6.385                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :2.000                      Max.   :15.000                     
##                                                                        
##    Fireplaces     FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.0000   Length:1459        Length:1459        Min.   :1895  
##  1st Qu.:0.0000   Class :character   Class :character   1st Qu.:1959  
##  Median :0.0000   Mode  :character   Mode  :character   Median :1979  
##  Mean   :0.5812                                         Mean   :1978  
##  3rd Qu.:1.0000                                         3rd Qu.:2002  
##  Max.   :4.0000                                         Max.   :2207  
##                                                         NA's   :78    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1459        Min.   :0.000   Min.   :   0.0   Length:1459       
##  Class :character   1st Qu.:1.000   1st Qu.: 318.0   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.766   Mean   : 472.8                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :5.000   Max.   :1488.0                     
##                     NA's   :1       NA's   :1                          
##   GarageCond         PavedDrive          WoodDeckSF       OpenPorchSF    
##  Length:1459        Length:1459        Min.   :   0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:   0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :   0.00   Median : 28.00  
##                                        Mean   :  93.17   Mean   : 48.31  
##                                        3rd Qu.: 168.00   3rd Qu.: 72.00  
##                                        Max.   :1424.00   Max.   :742.00  
##                                                                          
##  EnclosedPorch       X3SsnPorch       ScreenPorch        PoolArea      
##  Min.   :   0.00   Min.   :  0.000   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:   0.00   1st Qu.:  0.000   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :   0.00   Median :  0.000   Median :  0.00   Median :  0.000  
##  Mean   :  24.24   Mean   :  1.794   Mean   : 17.06   Mean   :  1.744  
##  3rd Qu.:   0.00   3rd Qu.:  0.000   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :1012.00   Max.   :360.000   Max.   :576.00   Max.   :800.000  
##                                                                        
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1459        Length:1459        Length:1459        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   58.17  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :17000.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1459        Length:1459       
##  1st Qu.: 4.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.104   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
## 
# Is anything missing at all?
anyNA(test_data)
## [1] TRUE
# Total number of missing cells
sum(is.na(test_data))
## [1] 7000
# Missing cells per column, showing only the columns that have any
Missing_values2 <- vapply(
  test_data,
  function(column) sum(is.na(column)),
  numeric(1)
)
Missing_values2[Missing_values2 > 0]
##     MSZoning  LotFrontage        Alley    Utilities  Exterior1st  Exterior2nd 
##            4          227         1352            2            1            1 
##   MasVnrType   MasVnrArea     BsmtQual     BsmtCond BsmtExposure BsmtFinType1 
##           16           15           44           45           44           42 
##   BsmtFinSF1 BsmtFinType2   BsmtFinSF2    BsmtUnfSF  TotalBsmtSF BsmtFullBath 
##            1           42            1            1            1            2 
## BsmtHalfBath  KitchenQual   Functional  FireplaceQu   GarageType  GarageYrBlt 
##            2            1            2          730           76           78 
## GarageFinish   GarageCars   GarageArea   GarageQual   GarageCond       PoolQC 
##           78            1            1           78           78         1456 
##        Fence  MiscFeature     SaleType 
##         1169         1408            1
# Let's replace the missing numeric records with the average value of each column.
# Only numeric columns are visited: calling mean() on a character column returns
# NA with a "not numeric or logical" warning and would change nothing.
numeric_columns <- names(test_data)[vapply(test_data, is.numeric, logical(1))]
for (numeric_column in numeric_columns) {
  is_missing <- is.na(test_data[[numeric_column]])
  if (any(is_missing)) {
    test_data[[numeric_column]][is_missing] <-
      mean(test_data[[numeric_column]], na.rm = TRUE)
  }
}


# Next, the missing values in the categorical variables are replaced
# (https://stackoverflow.com/questions/36377813/impute-most-frequent-categorical-value-in-all-columns-in-data-frame).
# i2 flags every column that is not numeric.
i2 <- !vapply(test_data, is.numeric, logical(1))

# Most frequent value of x; ties are resolved in favour of the value that
# comes first in sorted order.
Mode <- function(x) {
  candidates <- sort(unique(x))
  counts <- tabulate(match(x, candidates))
  candidates[which.max(counts)]
}

# Fill the gaps of one column with the most frequent of its observed values
fill_with_mode <- function(values) {
  gaps <- is.na(values)
  replace(values, gaps, Mode(values[!gaps]))
}

# Apply the fill to every non-numeric column
test_data[i2] <- lapply(test_data[i2], fill_with_mode)

# Confirm that no missing values remain
sum(is.na(test_data))
## [1] 0
# Predict Sale Price for test_data from our multiple linear regression model.
# The model was fitted on `houses`, whose column names were lower-cased after
# fread(), while read.csv() keeps the original capitalisation and prefixes
# names that start with a digit with "X" (e.g. X1stFlrSF). predict() looks
# variables up by exact name, so the test columns are first matched to the
# model's variable names, ignoring case and that "X" prefix.
normalise_name <- function(x) sub("^x(?=[0-9])", "", tolower(x), perl = TRUE)
model_vars <- all.vars(delete.response(terms(train.lm2)))
column_index <- match(normalise_name(model_vars), normalise_name(names(test_data)))
if (anyNA(column_index)) {
  stop(
    "test_data is missing model predictor(s): ",
    paste(model_vars[is.na(column_index)], collapse = ", "),
    call. = FALSE
  )
}
prediction_data <- setNames(test_data[column_index], model_vars)
test_data$SalePrice <- predict(train.lm2, newdata = prediction_data)


# Create new dataframe of ID's and predicted SalePrice for Kaggle Submission
Kaggle <- test_data[, c("Id", "SalePrice")]

# Export new dataframe to csv
write.csv(Kaggle, "605final.csv", row.names = FALSE)
kaggle submission
kaggle submission