Final Project

The House Prices data set comes from the Kaggle competition "House Prices: Advanced Regression Techniques" (https://www.kaggle.com/c/house-prices-advanced-regression-techniques). It contains 81 variables in the training set and 80 in the test set (the test set omits SalePrice), describing home sales in Ames, Iowa.

The house prices data set has some missing values and we are going to replace them with zero.

#load library 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(ggthemes)
library(corrplot)
## corrplot 0.92 loaded
library(rsample)
library(caret)
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# Arrange plots in a grid
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
# Load the Kaggle training and test sets, keeping text columns as character
# vectors rather than converting them to factors.
train <- read.csv("train.csv", stringsAsFactors = FALSE)
test <- read.csv("test.csv", stringsAsFactors = FALSE)

# Per-column summary of the training data
summary(train)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##    X2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      X3SsnPorch      ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 
# Total number of missing cells in the training data
sum(is.na(train))
## [1] 6965
# Number of missing values in each training column
colSums(is.na(train))
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##             0             0             0           259             0 
##        Street         Alley      LotShape   LandContour     Utilities 
##             0          1369             0             0             0 
##     LotConfig     LandSlope  Neighborhood    Condition1    Condition2 
##             0             0             0             0             0 
##      BldgType    HouseStyle   OverallQual   OverallCond     YearBuilt 
##             0             0             0             0             0 
##  YearRemodAdd     RoofStyle      RoofMatl   Exterior1st   Exterior2nd 
##             0             0             0             0             0 
##    MasVnrType    MasVnrArea     ExterQual     ExterCond    Foundation 
##             8             8             0             0             0 
##      BsmtQual      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1 
##            37            37            38            37             0 
##  BsmtFinType2    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating 
##            38             0             0             0             0 
##     HeatingQC    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF 
##             0             0             1             0             0 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##             0             0             0             0             0 
##      HalfBath  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd 
##             0             0             0             0             0 
##    Functional    Fireplaces   FireplaceQu    GarageType   GarageYrBlt 
##             0             0           690            81            81 
##  GarageFinish    GarageCars    GarageArea    GarageQual    GarageCond 
##            81             0             0            81            81 
##    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch 
##             0             0             0             0             0 
##   ScreenPorch      PoolArea        PoolQC         Fence   MiscFeature 
##             0             0          1453          1179          1406 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##             0             0             0             0             0 
##     SalePrice 
##             0
# Number of missing values in each test column
colSums(is.na(test))
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##             0             0             4           227             0 
##        Street         Alley      LotShape   LandContour     Utilities 
##             0          1352             0             0             2 
##     LotConfig     LandSlope  Neighborhood    Condition1    Condition2 
##             0             0             0             0             0 
##      BldgType    HouseStyle   OverallQual   OverallCond     YearBuilt 
##             0             0             0             0             0 
##  YearRemodAdd     RoofStyle      RoofMatl   Exterior1st   Exterior2nd 
##             0             0             0             1             1 
##    MasVnrType    MasVnrArea     ExterQual     ExterCond    Foundation 
##            16            15             0             0             0 
##      BsmtQual      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1 
##            44            45            44            42             1 
##  BsmtFinType2    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating 
##            42             1             1             1             0 
##     HeatingQC    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF 
##             0             0             0             0             0 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##             0             0             2             2             0 
##      HalfBath  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd 
##             0             0             0             1             0 
##    Functional    Fireplaces   FireplaceQu    GarageType   GarageYrBlt 
##             2             0           730            76            78 
##  GarageFinish    GarageCars    GarageArea    GarageQual    GarageCond 
##            78             1             1            78            78 
##    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch 
##             0             0             0             0             0 
##   ScreenPorch      PoolArea        PoolQC         Fence   MiscFeature 
##             0             0          1456          1169          1408 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##             0             0             0             1             0
# Replace every missing cell in both data sets with zero, then confirm the
# dimensions are unchanged.
# NOTE(review): the fill is applied to all columns, including character
# columns and GarageYrBlt, where 0 is not a meaningful value - confirm that
# this is intended before modelling on those columns.
train <- replace(train, is.na(train), 0)
test <- replace(test, is.na(test), 0)


dim(train)
## [1] 1460   81
dim(test)
## [1] 1459   80
# Select the independent variable X (GrLivArea) and the dependent
# variable Y (SalePrice) from the training data
X <- train[["GrLivArea"]]
Y <- train[["SalePrice"]]

# ggplot2 supplies the plotting functions used below
library(ggplot2)

# Histogram of X (independent variable), using bins 50 units wide
ggplot(mapping = aes(x = X)) +
  geom_histogram(
    binwidth = 50,
    fill = "lightblue",
    color = "black"
  ) +
  labs(
    title = "Histogram of GrLivArea (Independent Variable)",
    x = "GrLivArea",
    y = "Frequency"
  ) +
  theme_minimal()

# Density curve of Y (dependent variable)
ggplot(mapping = aes(x = Y)) +
  geom_density(
    fill = "lightgreen",
    color = "black"
  ) +
  labs(
    title = "Density Plot of SalePrice (Dependent Variable)",
    x = "SalePrice",
    y = "Density"
  ) +
  theme_minimal()

Probability

Let's calculate, as a minimum, the probabilities a through c below. Assume the small letter "x" is estimated as the 3rd quartile of the X variable, and the small letter "y" is estimated as the 2nd quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below. a. $P(X>x \mid Y>y)$ b. $P(X>x, Y>y)$ c. $P(X<x \mid Y>y)$

# Contingency table of GrLivArea (X) against SalePrice (Y).
#   rows:    X relative to x, the 3rd quartile of X
#   columns: Y relative to y, the 2nd quartile (median) of Y
# The cell counts previously split X at its median while the margins split it
# at the 3rd quartile, so rows and columns did not add up to their totals.
# NOTE: table output rendered before this fix is stale; re-knit to refresh it.

# Quartile thresholds (x_quartile2 is not needed for the table but is kept
# because it is referenced later in the file)
x_quartile2 <- quantile(X, probs = 0.50, na.rm = TRUE)
y_quartile2 <- quantile(Y, probs = 0.50, na.rm = TRUE)
x_quartile3 <- quantile(X, probs = 0.75, na.rm = TRUE)

# Marginal counts of X at both thresholds
x_leq_2d_quartile <- sum(X <= x_quartile2)
x_gt_2d_quartile <- sum(X > x_quartile2)
x_leq_3d_quartile <- sum(X <= x_quartile3)
x_gt_3d_quartile <- sum(X > x_quartile3)

# Cell counts. The variable names are kept for compatibility with later code:
# the FIRST part of each name is the row category (X versus x_quartile3) and
# the SECOND part is the column category (Y versus y_quartile2), regardless of
# the quartile number that appears in the name.
leq_2d_quartile_leq_3d_quartile <- sum(X <= x_quartile3 & Y <= y_quartile2)
leq_2d_quartile_gt_3d_quartile <- sum(X <= x_quartile3 & Y > y_quartile2)
gt_2d_quartile_leq_3d_quartile <- sum(X > x_quartile3 & Y <= y_quartile2)
gt_2d_quartile_gt_3d_quartile <- sum(X > x_quartile3 & Y > y_quartile2)

# Row totals: X <= x and X > x
total_leq_2d_quartile <- sum(leq_2d_quartile_leq_3d_quartile, leq_2d_quartile_gt_3d_quartile)
total_gt_2d_quartile <- sum(gt_2d_quartile_leq_3d_quartile, gt_2d_quartile_gt_3d_quartile)

# Column totals: Y <= y and Y > y (these fill the bottom "Total" row)
total_leq_3d_quartile <- sum(Y <= y_quartile2)
total_gt_3d_quartile <- sum(Y > y_quartile2)

# Grand total
total <- total_leq_3d_quartile + total_gt_3d_quartile

# Fill out the table row by row: two cells followed by the row total, then
# the column totals and the grand total
table_counts <- matrix(c(
  leq_2d_quartile_leq_3d_quartile, leq_2d_quartile_gt_3d_quartile, total_leq_2d_quartile,
  gt_2d_quartile_leq_3d_quartile, gt_2d_quartile_gt_3d_quartile, total_gt_2d_quartile,
  total_leq_3d_quartile, total_gt_3d_quartile, total
), nrow = 3, byrow = TRUE)

# Columns describe Y (2nd quartile); rows describe X (3rd quartile)
colnames(table_counts) <- c("<=2d quartile", ">2d quartile", "Total")
rownames(table_counts) <- c("<=3d quartile", ">3d quartile", "Total")

# Print the table
print(table_counts)
##               <=2d quartile >2d quartile Total
## <=3d quartile           577          154   731
## >3d quartile            155          574   729
## Total                  1095          365  1460

Definition GrLivArea: Above Ground Living Area, measured in square feet. It encompasses the living area that is not in the basement.

Properties whose above-ground living area is at or below the 3rd quartile and whose sale price is at or below the 2nd quartile (577 houses) represent houses that do not have a living basement. Those houses are located in more affordable areas.

Properties whose above-ground living area is at or below the 3rd quartile and whose sale price is above the 2nd quartile (154 houses) are most likely located in low-priced or average-priced areas, based on the detailed information available in the data set.

Properties whose above-ground living area is above the 3rd quartile and whose sale price is at or below the 2nd quartile (155 houses) are most likely located in up-and-coming neighborhoods where sale prices are increasing.

Properties whose above-ground living area is above the 3rd quartile and whose sale price is above the 2nd quartile (574 houses) are most likely located in the most expensive areas to live in, because they offer more above-ground living space (basement area is not counted toward this measurement).

# Probabilities a-c, with x = 3rd quartile of X and y = 2nd quartile of Y.
# Each value is computed directly from X and Y as a proportion of
# observations, so it always lies in [0, 1]. (The earlier version divided
# counts taken from different splits, which produced a value above 1 for a.)
# NOTE: re-knit to regenerate the printed values.
y_is_above <- Y > y_quartile2

# a) conditional: among houses with Y > y, the share that also have X > x
cat("a) P(X>x|Y>y) is ", mean(X[y_is_above] > x_quartile3), "\n")
# b) joint: share of all houses with both X > x and Y > y
cat("b) P(X>x,Y>y) is ", mean(X > x_quartile3 & y_is_above), "\n")
# c) conditional: among houses with Y > y, the share that have X < x
cat("c) P(X<x|Y>y) is ", mean(X[y_is_above] < x_quartile3), "\n")
# Independence check for A = {X > x} and B = {Y > y}, where x is the 3rd
# quartile of X and y is the 2nd quartile of Y. The probabilities are computed
# directly from the data so each one is a proportion in [0, 1]. (The earlier
# version divided a row total by a column total of table_counts, giving a
# "probability" of about 2.)
# NOTE: re-knit to regenerate the printed values.
in_A <- X > x_quartile3
in_B <- Y > y_quartile2

P_A_given_B <- mean(in_A[in_B])  # share of B observations that are also in A
P_A <- mean(in_A)
P_B <- mean(in_B)

# Check if P(A|B) = P(A)P(B); compare after rounding to 6 decimals to avoid
# spurious floating-point mismatches
P_A_times_P_B <- P_A * P_B
is_independent <- round(P_A_given_B, 6) == round(P_A_times_P_B, 6)

# Print probabilities and whether variables A and B are independent
cat("P(A|B) =", P_A_given_B, "\n")
cat("P(A) =", P_A, "\n")
cat("P(B) =", P_B, "\n")
cat("Is P(A|B) equal to P(A)P(B)?", is_independent, "\n")
# Chi-square test of association between A = {X > x} and B = {Y > y}.
# x is the 3rd quartile of X and y is the 2nd quartile of Y, as stated in the
# problem definition; X was previously split at its median here.
# NOTE: test output rendered before this fix is stale; re-knit to refresh it.
condition_X_greater_x <- train$GrLivArea > x_quartile3
condition_Y_greater_y <- train$SalePrice > y_quartile2
table_A_B <- table(condition_X_greater_x, condition_Y_greater_y)
chisq.test(table_A_B)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table_A_B
## X-squared = 483.29, df = 1, p-value < 2.2e-16

The chi-square test shows that there is a strong association between GrLivArea and SalePrice. The test yields a statistic of 483.29 and a p-value of approximately 4.118547e-107, which is far below the reported bound of 2.2e-16. We therefore reject the null hypothesis that the two variables are independent.

Descriptive and Inferential Statistics

# Per-column summary of the training data after the missing-value fill
# (includes GrLivArea and SalePrice, the columns used as X and Y)
summary(train)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   :  0.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 42.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 63.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 57.62  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 79.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.1                     
##                                        3rd Qu.: 164.2                     
##                                        Max.   :1600.0                     
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##    X2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :   0  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1958  
##  Median :1.000   Mode  :character   Mode  :character   Median :1977  
##  Mean   :0.613                                         Mean   :1869  
##  3rd Qu.:1.000                                         3rd Qu.:2001  
##  Max.   :3.000                                         Max.   :2010  
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##  EnclosedPorch      X3SsnPorch      ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000
# Numeric summaries of X (GrLivArea) and Y (SalePrice)
summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1130    1464    1515    1777    5642
summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
# Histogram of X (GrLivArea), 50-unit bins
histogram_x <- ggplot(data = train, mapping = aes(x = X)) +
  geom_histogram(binwidth = 50, colour = "black", fill = "lightblue") +
  ggtitle("Histogram of X (GrLivArea)") +
  xlab("GrLivArea") +
  ylab("Frequency") +
  theme_minimal()

# Histogram of Y (SalePrice), 50,000-unit bins
histogram_y <- ggplot(data = train, mapping = aes(x = Y)) +
  geom_histogram(binwidth = 50000, colour = "black", fill = "lightgreen") +
  ggtitle("Histogram of Y (SalePrice)") +
  xlab("SalePrice") +
  ylab("Frequency") +
  theme_minimal()

# Kernel density estimate of X (GrLivArea)
density_x <- ggplot(data = train, mapping = aes(x = X)) +
  geom_density(colour = "black", fill = "lightblue") +
  ggtitle("Density Plot of X (GrLivArea)") +
  xlab("GrLivArea") +
  ylab("Density") +
  theme_minimal()

# Kernel density estimate of Y (SalePrice)
density_y <- ggplot(data = train, mapping = aes(x = Y)) +
  geom_density(colour = "black", fill = "lightgreen") +
  ggtitle("Density Plot of Y (SalePrice)") +
  xlab("SalePrice") +
  ylab("Density") +
  theme_minimal()

# Bivariate view: X on the horizontal axis, Y on the vertical axis
scatter_plot <- ggplot(data = train, mapping = aes(x = X, y = Y)) +
  geom_point(colour = "blue", alpha = 0.5) +
  ggtitle("Scatter Plot of X vs Y") +
  xlab("GrLivArea") +
  ylab("SalePrice") +
  theme_minimal()


# Lay the five plots out on a three-row grid
gridExtra::grid.arrange(
  grobs = list(histogram_x, histogram_y, density_x, density_y, scatter_plot),
  nrow = 3
)

# Step 1: 95% Confidence Interval for the Difference in Mean
# NOTE(review): the standard error below is the unpooled (Welch) form while the
# t quantile uses the pooled degrees of freedom n_X + n_Y - 2; both columns also
# come from the same rows of `train`. Confirm this is the intended interval.

# Mean, standard deviation and count of the non-missing values of a vector
sample_stats <- function(values) {
  observed <- values[!is.na(values)]
  list(mean = mean(observed), sd = sd(observed), n = length(observed))
}

stats_X <- sample_stats(train$GrLivArea)
stats_Y <- sample_stats(train$SalePrice)

mean_X <- stats_X$mean
mean_Y <- stats_Y$mean
sd_X <- stats_X$sd
sd_Y <- stats_Y$sd
n_X <- stats_X$n
n_Y <- stats_Y$n

# Point estimate, its standard error, and the t-based margin of error
diff_mean <- mean_X - mean_Y
se_diff_mean <- sqrt((sd_X^2 / n_X) + (sd_Y^2 / n_Y))
t_critical <- qt(0.975, df = n_X + n_Y - 2)
margin_error <- t_critical * se_diff_mean
CI_diff_mean <- diff_mean + c(-1, 1) * margin_error

# Step 2: Correlation Matrix
# Pearson correlation of the two study variables, complete rows only
xy_columns <- c("GrLivArea", "SalePrice")
correlation_matrix <- cor(train[xy_columns], use = "complete.obs")



# Step 1: Invert the Correlation Matrix
# The inverse of the correlation matrix is the precision matrix
precision_matrix <- solve(correlation_matrix)

# Steps 2 and 3: multiply the two matrices in both orders
result1 <- correlation_matrix %*% precision_matrix
result2 <- precision_matrix %*% correlation_matrix

# Step 4: Principal Components Analysis (PCA)
# scale. = TRUE standardises both columns before the decomposition
pca_result <- prcomp(train[xy_columns], scale. = TRUE)

# Step 5: Interpretation and Discussion
summary(pca_result)
## Importance of components:
##                           PC1    PC2
## Standard deviation     1.3071 0.5398
## Proportion of Variance 0.8543 0.1457
## Cumulative Proportion  0.8543 1.0000
# Step 3: Hypothesis Test for Correlation
# Fisher z-transformation of the sample correlation, a normal-theory 99%
# interval on the z scale, then tanh() to map the limits back to correlations.
r_xy <- correlation_matrix[1, 2]
Z <- 0.5 * log((1 + r_xy) / (1 - r_xy))
SE_Z <- 1 / sqrt(n_X - 3)
z_critical <- qnorm(0.995)
margin_error_Z <- z_critical * SE_Z
CI_correlation <- tanh(Z + c(-1, 1) * margin_error_Z)

# Print results
cat("95% Confidence Interval for the Difference in Mean:", CI_diff_mean, "\n")
## 95% Confidence Interval for the Difference in Mean: -183482.5 -175329
cat("Correlation Matrix for GrLivArea and SalePrice:", "\n")
## Correlation Matrix for GrLivArea and SalePrice:
print(correlation_matrix)
##           GrLivArea SalePrice
## GrLivArea 1.0000000 0.7086245
## SalePrice 0.7086245 1.0000000
cat("Hypothesis Test for Correlation (GrLivArea and SalePrice):", "\n")
## Hypothesis Test for Correlation (GrLivArea and SalePrice):
cat("H0: Correlation = 0, HA: Correlation ≠ 0", "\n")
## H0: Correlation = 0, HA: Correlation ≠ 0
cat("99% Confidence Interval for Correlation:", CI_correlation, "\n")
## 99% Confidence Interval for Correlation: 0.6733974 0.7406408

The univariate plots show how X and Y are distributed and reveal the presence of outliers. The scatterplot visualizes the relationship between X and Y. The correlation matrix shows a strong positive relationship between the two variables, with a correlation of about 0.71.

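The 99% confidence interval for the correlation (0.673 to 0.741) does not contain zero, so we reject the null hypothesis of no correlation: there is a strong relationship between the variables. We are 99% confident that the true correlation coefficient lies within this range.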

# Scree plot to visualize variance explained by each principal component.
# The variance of a component is its squared standard deviation; dividing by
# the total gives the proportion of variance explained.
pc_variance <- pca_result$sdev^2
scree_data <- data.frame(
  PC = seq_along(pc_variance),
  Variance = pc_variance / sum(pc_variance)
)
scree_plot <- ggplot(scree_data, aes(x = PC, y = Variance)) +
  geom_bar(stat = "identity", fill = "skyblue", width = 0.5) +
  labs(title = "Scree Plot", x = "Principal Component", y = "Proportion of Variance Explained") +
  theme_minimal()

# Biplot to visualize relationships between original variables and principal components.
# `labels` in ggbiplot() labels the observations, not the variables, so the
# two-element vector that used to be passed here was recycled over every point;
# it is dropped. var.axes = TRUE draws the variable arrows (named from the PCA
# loadings) and is required for the correlation circle to appear.
biplot <- ggbiplot::ggbiplot(pca_result,
                             ellipse = TRUE, circle = TRUE, var.axes = TRUE) +
  ggtitle("Biplot of PCA") +
  theme_minimal()

# Arrange plots in a grid
gridExtra::grid.arrange(scree_plot, biplot, nrow = 1)

The scree plot shows the proportion of variance explained by each principal component, i.e. the eigenvalues obtained from the Principal Component Analysis scaled so that they sum to one. The biplot shows the relationships between the original variables (X and Y) and the principal components. The direction and length of the arrows represent the direction and strength of the relationship between the variables and the principal components.

For Principal Component 1 (PC1):

PC1 has a standard deviation of 1.3071, a proportion of variance of 0.8543 and a cumulative proportion of 0.8543. This high standard deviation means that PC1 captures most of the variability in the data set (about 85%), so PC1 could represent the primary factor influencing the observations. PC2 captures the remaining, less dominant patterns or trends in the data (about 15% of the variance).

# Step 1: Shift the skewed variable so that its smallest value is 1
shifted_variable <- train$GrLivArea - min(train$GrLivArea) + 1

# Step 2: Fit exponential distribution
# Attaching MASS masks dplyr::select (see the message below); write
# dplyr::select() explicitly if select() is needed after this point.
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
fit_exp <- fitdistr(shifted_variable, densfun = "exponential")

# Step 3: Obtain the optimal value of λ (the fitted rate)
lambda <- fit_exp$estimate

# Step 4: Generate 1000 samples from the exponential distribution
# Seed the generator so the simulated histogram is reproducible between runs.
set.seed(123)
samples <- rexp(1000, rate = lambda)

# Step 5: Plot histograms for comparison
# par() returns the previous settings; restore them after the 1x2 layout so
# later base-graphics plots are not squeezed into half of the device.
old_par <- par(mfrow = c(1, 2))
hist(shifted_variable, main = "Original Variable", xlab = "Value", col = "skyblue", border = "white")
hist(samples, main = "Exponential Distribution", xlab = "Value", col = "lightgreen", border = "white")
par(old_par)

# Step 6: Calculate percentiles using the quantile function of the fitted
# exponential distribution
percentile_5 <- qexp(0.05, rate = lambda)
percentile_95 <- qexp(0.95, rate = lambda)

# Step 7: Compute 95% confidence interval from empirical data assuming normality
mean_var <- mean(shifted_variable)
sd_var <- sd(shifted_variable)
se_var <- sd_var / sqrt(length(shifted_variable))
CI <- c(mean_var - 1.96 * se_var,
        mean_var + 1.96 * se_var)

# Step 8: Calculate empirical 5th and 95th percentiles
empirical_percentile_5 <- quantile(shifted_variable, 0.05)
empirical_percentile_95 <- quantile(shifted_variable, 0.95)


# Create a data frame (row names "5%" / "95%" come from quantile())
empirical_percentiles_df <- data.frame(Percentile = c("5th", "95th"),
                                       Value = c(empirical_percentile_5, empirical_percentile_95))
empirical_percentiles_df
##     Percentile  Value
## 5%         5th  515.0
## 95%       95th 2133.1

The simulated exponential data is much more skewed than the observed data, so the exponential distribution is not a great fit. We need to use better techniques to model the skewness.

Modeling

We use correlation to pick the most relevant features for predicting the Sale Price of a house. For every one-unit increase in OverallQual (a rating of the overall material and finish of the house), the Sale Price is estimated to increase by $18,566.79, holding all other variables constant. Each additional car of garage capacity (GarageCars) is associated with an estimated increase of $17,578.73 in the Sale Price, holding all other variables constant. The model did pretty well: the R-squared value of 0.8128 suggests that approximately 81.3% of the variability in Sale Price is explained by the predictor variables included in the model. We see that the overall quality of the house, the year the house was built and the garage space have significant impacts on the Sale Price of the house.

We discovered some outliers when plotting the fitted values against the residuals. We see individual points that fall far from the main cluster of points, which means that these outliers could represent observations that are poorly explained by the model, or data points with unusual characteristics that require more investigation.

We were able to achieve a score of 0.67155 on Kaggle, so the model still has to improve. We could use other evaluation techniques, or use Principal Component Analysis (PCA) to reduce the dimensionality of the data while preserving as much variance as possible. PCA can also help to identify these outliers so that they can be dealt with.

# modeling

# see distribution: SalePrice on the x axis against GrLivArea on the y axis
library(dplyr)
ggplot(data = train, mapping = aes(x = SalePrice, y = GrLivArea)) +
  geom_point()

# preprocessing data
set.seed(123)


# turn every character column into a factor; other columns are left as they are
training <- mutate(train, across(where(is.character), as.factor))

testing <- mutate(test, across(where(is.character), as.factor))

# report the dimensions (rows, columns) of both data sets
cat("training set has", dim(training), "testing set has ", dim(testing))
## training set has 1460 81 testing set has  1459 80
glimpse(training)
## Rows: 1,460
## Columns: 81
## $ Id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass    <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning      <fct> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, RL, …
## $ LotFrontage   <dbl> 65, 80, 68, 60, 84, 85, 75, 0, 51, 50, 70, 85, 0, 91, 0,…
## $ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street        <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pa…
## $ Alley         <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ LotShape      <fct> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg, I…
## $ LandContour   <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, L…
## $ Utilities     <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, …
## $ LotConfig     <fct> Inside, FR2, Inside, Corner, FR2, Inside, Inside, Corner…
## $ LandSlope     <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, G…
## $ Neighborhood  <fct> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mitchel, So…
## $ Condition1    <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN, Artery,…
## $ Condition2    <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Ar…
## $ BldgType      <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 2f…
## $ HouseStyle    <fct> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin, 1Story, …
## $ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt     <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd  <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle     <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gable, …
## $ RoofMatl      <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg, Co…
## $ Exterior1st   <fct> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, VinylSd, Vi…
## $ Exterior2nd   <fct> VinylSd, MetalSd, VinylSd, Wd Shng, VinylSd, VinylSd, Vi…
## $ MasVnrType    <fct> BrkFace, None, BrkFace, None, BrkFace, None, Stone, Ston…
## $ MasVnrArea    <dbl> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ ExterQual     <fct> Gd, TA, Gd, TA, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ ExterCond     <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ Foundation    <fct> PConc, CBlock, PConc, BrkTil, PConc, Wood, PConc, CBlock…
## $ BsmtQual      <fct> Gd, Gd, Gd, TA, Gd, Gd, Ex, Gd, TA, TA, TA, Ex, TA, Gd, …
## $ BsmtCond      <fct> TA, TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ BsmtExposure  <fct> No, Gd, Mn, No, Av, No, Av, Mn, No, No, No, No, No, Av, …
## $ BsmtFinType1  <fct> GLQ, ALQ, GLQ, ALQ, GLQ, GLQ, GLQ, ALQ, Unf, GLQ, Rec, G…
## $ BsmtFinSF1    <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinType2  <fct> Unf, Unf, Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Unf, Unf, U…
## $ BsmtFinSF2    <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF     <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF   <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating       <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, Ga…
## $ HeatingQC     <fct> Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Ex, Ex, Ex, TA, Ex, …
## $ CentralAir    <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ Electrical    <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, …
## $ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual   <fct> Gd, TA, Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional    <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Min1, Typ, Typ, …
## $ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ FireplaceQu   <fct> 0, TA, TA, Gd, TA, 0, Gd, TA, TA, TA, 0, Gd, 0, Gd, Fa, …
## $ GarageType    <fct> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd, Attchd, …
## $ GarageYrBlt   <dbl> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageFinish  <fct> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RFn, Unf, F…
## $ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ GarageQual    <fct> TA, TA, TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA, TA, TA, …
## $ GarageCond    <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ PavedDrive    <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC        <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Fence         <fct> 0, 0, 0, 0, 0, MnPrv, 0, 0, 0, 0, 0, 0, 0, 0, GdWo, GdPr…
## $ MiscFeature   <fct> 0, 0, 0, 0, 0, Shed, 0, Shed, 0, 0, 0, 0, 0, 0, 0, 0, Sh…
## $ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold        <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold        <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType      <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New, WD, New…
## $ SaleCondition <fct> Normal, Normal, Normal, Abnorml, Normal, Normal, Normal,…
## $ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …
# count the missing values in every column of train (one row, one count per column)
na_counts <- summarise(train, across(everything(), ~ sum(is.na(.x))))
glimpse(na_counts)
## Rows: 1
## Columns: 81
## $ Id            <int> 0
## $ MSSubClass    <int> 0
## $ MSZoning      <int> 0
## $ LotFrontage   <int> 0
## $ LotArea       <int> 0
## $ Street        <int> 0
## $ Alley         <int> 0
## $ LotShape      <int> 0
## $ LandContour   <int> 0
## $ Utilities     <int> 0
## $ LotConfig     <int> 0
## $ LandSlope     <int> 0
## $ Neighborhood  <int> 0
## $ Condition1    <int> 0
## $ Condition2    <int> 0
## $ BldgType      <int> 0
## $ HouseStyle    <int> 0
## $ OverallQual   <int> 0
## $ OverallCond   <int> 0
## $ YearBuilt     <int> 0
## $ YearRemodAdd  <int> 0
## $ RoofStyle     <int> 0
## $ RoofMatl      <int> 0
## $ Exterior1st   <int> 0
## $ Exterior2nd   <int> 0
## $ MasVnrType    <int> 0
## $ MasVnrArea    <int> 0
## $ ExterQual     <int> 0
## $ ExterCond     <int> 0
## $ Foundation    <int> 0
## $ BsmtQual      <int> 0
## $ BsmtCond      <int> 0
## $ BsmtExposure  <int> 0
## $ BsmtFinType1  <int> 0
## $ BsmtFinSF1    <int> 0
## $ BsmtFinType2  <int> 0
## $ BsmtFinSF2    <int> 0
## $ BsmtUnfSF     <int> 0
## $ TotalBsmtSF   <int> 0
## $ Heating       <int> 0
## $ HeatingQC     <int> 0
## $ CentralAir    <int> 0
## $ Electrical    <int> 0
## $ X1stFlrSF     <int> 0
## $ X2ndFlrSF     <int> 0
## $ LowQualFinSF  <int> 0
## $ GrLivArea     <int> 0
## $ BsmtFullBath  <int> 0
## $ BsmtHalfBath  <int> 0
## $ FullBath      <int> 0
## $ HalfBath      <int> 0
## $ BedroomAbvGr  <int> 0
## $ KitchenAbvGr  <int> 0
## $ KitchenQual   <int> 0
## $ TotRmsAbvGrd  <int> 0
## $ Functional    <int> 0
## $ Fireplaces    <int> 0
## $ FireplaceQu   <int> 0
## $ GarageType    <int> 0
## $ GarageYrBlt   <int> 0
## $ GarageFinish  <int> 0
## $ GarageCars    <int> 0
## $ GarageArea    <int> 0
## $ GarageQual    <int> 0
## $ GarageCond    <int> 0
## $ PavedDrive    <int> 0
## $ WoodDeckSF    <int> 0
## $ OpenPorchSF   <int> 0
## $ EnclosedPorch <int> 0
## $ X3SsnPorch    <int> 0
## $ ScreenPorch   <int> 0
## $ PoolArea      <int> 0
## $ PoolQC        <int> 0
## $ Fence         <int> 0
## $ MiscFeature   <int> 0
## $ MiscVal       <int> 0
## $ MoSold        <int> 0
## $ YrSold        <int> 0
## $ SaleType      <int> 0
## $ SaleCondition <int> 0
## $ SalePrice     <int> 0
# Use the classic ggplot2 theme for all following plots
theme_set(theme_classic())

# Distribution of the response variable
ggplot(data = training, aes(SalePrice)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# The response column is named SalePrice; the previous spelling `SalesPrices`
# does not exist in `train`, so `$` returned NULL and the summary was empty.
summary(train$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
# Keep only the numeric columns of each data set.
# dplyr::select is written out in full because MASS, attached earlier,
# masks select().
training_numerical <- dplyr::select(training, where(is.numeric))
testing_numerical <- dplyr::select(testing, where(is.numeric))
dim(training_numerical)
## [1] 1460   38
dim(testing_numerical)
## [1] 1459   37
# feature selection
# Pairwise correlations between all numeric columns
cor_matrix <- cor(training_numerical)

# Upper-triangle circle plot of the correlation matrix
corrplot(
  cor_matrix,
  method = "circle",
  type = "upper",
  tl.col = "black",
  tl.srt = 45
)

# regression
# SalePrice regressed on every other numeric column. Id is one of those
# columns, and the summary reports TotalBsmtSF and GrLivArea as not estimable
# because of singularities.
reg <- lm(SalePrice ~ ., data = training_numerical)
summary(reg)
## 
## Call:
## lm(formula = SalePrice ~ ., data = training_numerical)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -473606  -16047   -2186   14258  298649 
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.843e+05  1.401e+06   0.346 0.729585    
## Id            -1.050e+00  2.168e+00  -0.485 0.628079    
## MSSubClass    -1.673e+02  2.649e+01  -6.315 3.61e-10 ***
## LotFrontage    8.827e+00  2.852e+01   0.310 0.756985    
## LotArea        3.917e-01  9.990e-02   3.921 9.23e-05 ***
## OverallQual    1.731e+04  1.181e+03  14.660  < 2e-16 ***
## OverallCond    5.113e+03  1.023e+03   5.000 6.45e-07 ***
## YearBuilt      3.381e+02  6.044e+01   5.594 2.65e-08 ***
## YearRemodAdd   1.221e+02  6.602e+01   1.849 0.064638 .  
## MasVnrArea     2.808e+01  5.918e+00   4.744 2.31e-06 ***
## BsmtFinSF1     1.861e+01  4.632e+00   4.019 6.16e-05 ***
## BsmtFinSF2     9.144e+00  7.003e+00   1.306 0.191873    
## BsmtUnfSF      8.506e+00  4.168e+00   2.041 0.041452 *  
## TotalBsmtSF           NA         NA      NA       NA    
## X1stFlrSF      4.686e+01  5.726e+00   8.183 6.07e-16 ***
## X2ndFlrSF      4.810e+01  4.918e+00   9.779  < 2e-16 ***
## LowQualFinSF   1.779e+01  1.968e+01   0.904 0.366030    
## GrLivArea             NA         NA      NA       NA    
## BsmtFullBath   8.549e+03  2.594e+03   3.295 0.001008 ** 
## BsmtHalfBath   1.709e+03  4.054e+03   0.421 0.673467    
## FullBath       3.234e+03  2.800e+03   1.155 0.248390    
## HalfBath      -1.913e+03  2.642e+03  -0.724 0.469234    
## BedroomAbvGr  -1.027e+04  1.680e+03  -6.114 1.25e-09 ***
## KitchenAbvGr  -1.576e+04  5.195e+03  -3.033 0.002466 ** 
## TotRmsAbvGrd   5.005e+03  1.228e+03   4.076 4.84e-05 ***
## Fireplaces     4.075e+03  1.757e+03   2.319 0.020556 *  
## GarageYrBlt   -1.457e+01  2.683e+00  -5.428 6.67e-08 ***
## GarageCars     1.569e+04  2.974e+03   5.275 1.53e-07 ***
## GarageArea     5.001e+00  9.716e+00   0.515 0.606854    
## WoodDeckSF     2.579e+01  7.926e+00   3.254 0.001164 ** 
## OpenPorchSF   -6.285e+00  1.506e+01  -0.417 0.676420    
## EnclosedPorch  1.162e+01  1.671e+01   0.695 0.487055    
## X3SsnPorch     2.023e+01  3.114e+01   0.650 0.515997    
## ScreenPorch    5.771e+01  1.704e+01   3.387 0.000726 ***
## PoolArea      -3.215e+01  2.354e+01  -1.366 0.172244    
## MiscVal       -4.808e-01  1.844e+00  -0.261 0.794330    
## MoSold        -4.188e+01  3.420e+02  -0.122 0.902546    
## YrSold        -7.129e+02  6.964e+02  -1.024 0.306203    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34450 on 1424 degrees of freedom
## Multiple R-squared:  0.8164, Adjusted R-squared:  0.8119 
## F-statistic: 180.9 on 35 and 1424 DF,  p-value: < 2.2e-16
# Predictor variables used by the reduced regression model
predictors <- c(
  "MSSubClass", "LotArea", "OverallQual", "OverallCond", "YearBuilt",
  "MasVnrArea", "BsmtFinSF1", "X1stFlrSF", "X2ndFlrSF", "GrLivArea",
  "BsmtFullBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd",
  "GarageYrBlt", "GarageCars", "OpenPorchSF", "ScreenPorch", "PoolArea"
)
# Build the modelling frame: the predictors plus the response, complete rows only.
# NOTE(review): the -1 sits in the ROW position, so it drops the first
# observation rather than a column (the fit below uses 1459 rows). Confirm
# whether that is intended before changing it, because later code relies on
# this row count.
model_columns <- c(predictors, "SalePrice")
regression_data <- na.omit(training[-1, model_columns])
# Fit the multiple regression model
model <- lm(SalePrice ~ ., data = regression_data)

# Print the model summary
summary(model)
## 
## Call:
## lm(formula = SalePrice ~ ., data = regression_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -475371  -15798   -2237   14500  290412 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8.379e+05  8.896e+04  -9.419  < 2e-16 ***
## MSSubClass   -1.662e+02  2.570e+01  -6.465 1.39e-10 ***
## LotArea       4.604e-01  9.822e-02   4.688 3.02e-06 ***
## OverallQual   1.857e+04  1.114e+03  16.664  < 2e-16 ***
## OverallCond   5.894e+03  9.172e+02   6.427 1.77e-10 ***
## YearBuilt     4.021e+02  4.504e+01   8.929  < 2e-16 ***
## MasVnrArea    2.734e+01  5.842e+00   4.680 3.14e-06 ***
## BsmtFinSF1    1.130e+01  2.951e+00   3.829 0.000134 ***
## X1stFlrSF     3.887e+01  1.987e+01   1.956 0.050685 .  
## X2ndFlrSF     3.022e+01  1.963e+01   1.539 0.123919    
## GrLivArea     2.073e+01  1.964e+01   1.055 0.291505    
## BsmtFullBath  8.904e+03  2.353e+03   3.784 0.000161 ***
## BedroomAbvGr -1.060e+04  1.630e+03  -6.500 1.11e-10 ***
## KitchenAbvGr -1.879e+04  5.043e+03  -3.727 0.000202 ***
## TotRmsAbvGrd  5.076e+03  1.221e+03   4.156 3.44e-05 ***
## GarageYrBlt  -1.531e+01  2.618e+00  -5.848 6.15e-09 ***
## GarageCars    1.758e+04  2.046e+03   8.594  < 2e-16 ***
## OpenPorchSF  -5.666e+00  1.482e+01  -0.382 0.702278    
## ScreenPorch   5.151e+01  1.654e+01   3.114 0.001884 ** 
## PoolArea     -2.762e+01  2.330e+01  -1.185 0.236207    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34610 on 1439 degrees of freedom
## Multiple R-squared:  0.8128, Adjusted R-squared:  0.8103 
## F-statistic: 328.8 on 19 and 1439 DF,  p-value: < 2.2e-16
# Read the sample_submission file (provides the Ids expected in the submission)
sample_submission <- read.csv("https://raw.githubusercontent.com/joewarner89/DATA-605-Computational-Mathematics/main/project/sample_submission.csv")

# Predictor columns of the TEST set. The previous code passed the training
# frame (regression_data) as newdata, so the file contained fitted values for
# training houses written against test Ids.
test_features <- testing[, predictors, drop = FALSE]

# The report replaces missing values with zero; apply the same rule here so
# predict() cannot return NA. This is a no-op if the test set is already
# complete. TODO confirm the test set was imputed the same way as train.
test_features[is.na(test_features)] <- 0

# The submission must have one row per test house, in the same Id order
stopifnot(
  nrow(test_features) == nrow(sample_submission),
  all(sample_submission$Id == testing$Id)
)

# Create a new data frame with only the "Id" column (named as in sample_submission)
predictions_df <- data.frame(Id = sample_submission$Id)

# Predict the SalePrice of the test houses using the regression model
predictions_df$SalePrice <- unname(predict(model, newdata = test_features))


# Residuals of the fitted regression model
res <- residuals(model)
# residuals vs. fitted values
plot(fitted(model), res)

# horizontal reference line at zero residual
abline(h = 0)

# normal Q-Q plot of the residuals
qqnorm(res)

# reference line for the Q-Q plot
qqline(res)

# Save the predictions as a CSV file without row names
write.csv(predictions_df, file = "predictions.csv", row.names = FALSE)
# Check how many rows were written
num_rows <- nrow(predictions_df)
print(num_rows)  # Should be 1459
## [1] 1459
kaggle submission
kaggle submission