#Introduction: For the final exam we need to import the train data from https://www.kaggle.com/c/house-prices-advanced-regression-techniques and register on Kaggle. I started by loading the libraries, then I imported the data as shown below:

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'readr' was built under R version 4.3.2
## Warning: package 'purrr' was built under R version 4.3.2
## Warning: package 'stringr' was built under R version 4.3.2
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.4.4     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tinytex)
## Warning: package 'tinytex' was built under R version 4.3.3
library(ggplot2)
# Load the Kaggle training set; the path is specific to the author's machine.
datahouse <- read.csv(file = "C:/Users/Chafiaa/Downloads/train.csv")
# Preview the first six rows.
head(datahouse, n = 6L)
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1  1         60       RL          65    8450   Pave  <NA>      Reg         Lvl
## 2  2         20       RL          80    9600   Pave  <NA>      Reg         Lvl
## 3  3         60       RL          68   11250   Pave  <NA>      IR1         Lvl
## 4  4         70       RL          60    9550   Pave  <NA>      IR1         Lvl
## 5  5         60       RL          84   14260   Pave  <NA>      IR1         Lvl
## 6  6         50       RL          85   14115   Pave  <NA>      IR1         Lvl
##   Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 2    AllPub       FR2       Gtl      Veenker      Feedr       Norm     1Fam
## 3    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 4    AllPub    Corner       Gtl      Crawfor       Norm       Norm     1Fam
## 5    AllPub       FR2       Gtl      NoRidge       Norm       Norm     1Fam
## 6    AllPub    Inside       Gtl      Mitchel       Norm       Norm     1Fam
##   HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1     2Story           7           5      2003         2003     Gable  CompShg
## 2     1Story           6           8      1976         1976     Gable  CompShg
## 3     2Story           7           5      2001         2002     Gable  CompShg
## 4     2Story           7           5      1915         1970     Gable  CompShg
## 5     2Story           8           5      2000         2000     Gable  CompShg
## 6     1.5Fin           5           5      1993         1995     Gable  CompShg
##   Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1     VinylSd     VinylSd    BrkFace        196        Gd        TA      PConc
## 2     MetalSd     MetalSd       None          0        TA        TA     CBlock
## 3     VinylSd     VinylSd    BrkFace        162        Gd        TA      PConc
## 4     Wd Sdng     Wd Shng       None          0        TA        TA     BrkTil
## 5     VinylSd     VinylSd    BrkFace        350        Gd        TA      PConc
## 6     VinylSd     VinylSd       None          0        TA        TA       Wood
##   BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1       Gd       TA           No          GLQ        706          Unf
## 2       Gd       TA           Gd          ALQ        978          Unf
## 3       Gd       TA           Mn          GLQ        486          Unf
## 4       TA       Gd           No          ALQ        216          Unf
## 5       Gd       TA           Av          GLQ        655          Unf
## 6       Gd       TA           No          GLQ        732          Unf
##   BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1          0       150         856    GasA        Ex          Y      SBrkr
## 2          0       284        1262    GasA        Ex          Y      SBrkr
## 3          0       434         920    GasA        Ex          Y      SBrkr
## 4          0       540         756    GasA        Gd          Y      SBrkr
## 5          0       490        1145    GasA        Ex          Y      SBrkr
## 6          0        64         796    GasA        Ex          Y      SBrkr
##   X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1       856       854            0      1710            1            0        2
## 2      1262         0            0      1262            0            1        2
## 3       920       866            0      1786            1            0        2
## 4       961       756            0      1717            1            0        1
## 5      1145      1053            0      2198            1            0        2
## 6       796       566            0      1362            1            0        1
##   HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1        1            3            1          Gd            8        Typ
## 2        0            3            1          TA            6        Typ
## 3        1            3            1          Gd            6        Typ
## 4        0            3            1          Gd            7        Typ
## 5        1            4            1          Gd            9        Typ
## 6        1            1            1          TA            5        Typ
##   Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1          0        <NA>     Attchd        2003          RFn          2
## 2          1          TA     Attchd        1976          RFn          2
## 3          1          TA     Attchd        2001          RFn          2
## 4          1          Gd     Detchd        1998          Unf          3
## 5          1          TA     Attchd        2000          RFn          3
## 6          0        <NA>     Attchd        1993          Unf          2
##   GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1        548         TA         TA          Y          0          61
## 2        460         TA         TA          Y        298           0
## 3        608         TA         TA          Y          0          42
## 4        642         TA         TA          Y          0          35
## 5        836         TA         TA          Y        192          84
## 6        480         TA         TA          Y         40          30
##   EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1             0          0           0        0   <NA>  <NA>        <NA>
## 2             0          0           0        0   <NA>  <NA>        <NA>
## 3             0          0           0        0   <NA>  <NA>        <NA>
## 4           272          0           0        0   <NA>  <NA>        <NA>
## 5             0          0           0        0   <NA>  <NA>        <NA>
## 6             0        320           0        0   <NA> MnPrv        Shed
##   MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1       0      2   2008       WD        Normal    208500
## 2       0      5   2007       WD        Normal    181500
## 3       0      9   2008       WD        Normal    223500
## 4       0      2   2006       WD       Abnorml    140000
## 5       0     12   2008       WD        Normal    250000
## 6     700     10   2009       WD        Normal    143000
# Column-by-column summary of the training data; the printed result follows.
summary(datahouse)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##    X2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      X3SsnPorch      ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 

The train data that I imported has 81 columns and 1460 rows.

#Find an X variable that is skewed to the right:

# Histograms of three candidate predictors, inspected for right skew.
hist(x = datahouse$GarageArea)

hist(x = datahouse$LotFrontage)

# WoodDeckSF is the column assigned to X in the next step.
hist(x = datahouse$WoodDeckSF)

# X is the wood-deck area; Y is the sale price, chosen as the dependent
# variable.
X <- datahouse[["WoodDeckSF"]]
Y <- datahouse[["SalePrice"]]
# Quartiles of each variable (quantile() defaults to 0%, 25%, 50%, 75%, 100%).
quantile(x = X)
##   0%  25%  50%  75% 100% 
##    0    0    0  168  857
quantile(x = Y)
##     0%    25%    50%    75%   100% 
##  34900 129975 163000 214000 755000

#probability:

a.P(X>.75 | Y>0.5)

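P(X > x | Y > y), where x is the 3rd quartile of X (168) and y is the median of Y (163000). P(A) = P(X > x) ≈ .25 and P(B) = P(Y > y) ≈ .50. P(A|B) = P(B and A) / P(B). If A and B were independent this would be (.50 * .25) / .50 = .25; from the counts below it is 248 / 728 ≈ .34.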

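#b.P(X>0.75, Y>0.5): P(A) = P(X > x) ≈ .25 and P(B) = P(Y > y) ≈ .50. A joint probability is a product, not a sum: if A and B were independent, P(A,B) = P(A) * P(B) = (.25 * .50) = .125; from the counts below it is 248 / 1460 ≈ .17.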

#c.P(X<0.75 | Y>0.5)

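P(A) = P(X < x) ≈ .75 and P(B) = P(Y > y) ≈ .50. P(A|B) = P(B and A) / P(B). If A and B were independent this would be (.50 * .75) / .50 = .75; from the counts below (which use X <= 168) it is 480 / 728 ≈ .66.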

#Splitting the data doesn’t change the relationship.

# Cell counts of the 2 x 2 split at X = 168 (3rd quartile of X) and
# Y = 163000 (median of Y). Each pipeline returns a one-row data frame whose
# column `n` holds the number of matching rows.
datahouse %>%
  dplyr::filter(X <= 168, Y <= 163000) %>%
  dplyr::count()
##     n
## 1 624
datahouse %>%
  dplyr::filter(X <= 168, Y > 163000) %>%
  dplyr::count()
##     n
## 1 480
datahouse %>%
  dplyr::filter(X > 168, Y <= 163000) %>%
  dplyr::count()
##     n
## 1 108
datahouse %>%
  dplyr::filter(X > 168, Y > 163000) %>%
  dplyr::count()
##     n
## 1 248

I attached the table of values in a separate file because I built it manually.

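#Does P(A|B) = P(A)P(B)? P(A) = 356/1460 = .24 and P(B) = 728/1460 = .50. Assuming independence, P(A|B) = (P(B) * P(A))/P(B) = (.50 * .24)/.50 = .24, whereas the observed value from the counts is 248/728 ≈ .34.

P(A) * P(B) = .24 * .5 = .12, which matches neither value of P(A|B), so the equality does not hold; the chi-squared test below confirms that A and B are not independent.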

# Chi-squared test of association on the four cell counts computed above.
# matrix() fills column by column, so column 1 holds the X <= 168 counts
# (624, 480) and column 2 the X > 168 counts (108, 248); within each column
# the first row is Y <= 163000 and the second row is Y > 163000.
chisq.test(matrix(c(624,480,108,248), ncol=2))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  matrix(c(624, 480, 108, 248), ncol = 2)
## X-squared = 72.785, df = 1, p-value < 2.2e-16

#Descriptive and Inferential Statistics:

# Keep two categorical columns (Functional, LotShape) and two numeric columns
# (LotFrontage, LotArea) for the descriptive plots.
datahouse1 <- datahouse[c("Functional", "LotShape", "LotFrontage", "LotArea")]
# Column types and the first few values of each selected column.
glimpse(datahouse1)
## Rows: 1,460
## Columns: 4
## $ Functional  <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "M…
## $ LotShape    <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", "R…
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, NA…
## $ LotArea     <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 6120,…
head(datahouse1, n = 6L)
##   Functional LotShape LotFrontage LotArea
## 1        Typ      Reg          65    8450
## 2        Typ      Reg          80    9600
## 3        Typ      IR1          68   11250
## 4        Typ      IR1          60    9550
## 5        Typ      IR1          84   14260
## 6        Typ      IR1          85   14115
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pairwise plot matrix of the four selected columns, drawn with the
# black-and-white theme.
pair_plot <- ggpairs(datahouse1)
pair_plot + theme_bw()
## Warning: Removed 259 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 259 rows containing non-finite values (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 259 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 259 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 259 rows containing non-finite values (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 259 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 259 rows containing missing values (`geom_point()`).

#scatter plot of X and Y

# Scatter plot of sale price against wood-deck area.
# The aesthetics name columns of `datahouse` directly rather than the global
# vectors X and Y, so the plot depends only on its `data` argument.
ggplot(datahouse, aes(x = WoodDeckSF, y = SalePrice)) +
  geom_point() +
  labs(
    x = "wood deck",
    y = "sale price",
    title = "Scatter Plot of wood deck and sale price"
  )

# Welch two-sample t-test of X against Y with a 95% confidence interval.
# NOTE(review): X is a wood-deck area and Y is a sale price, so a
# difference-in-means test between them may not be the intended analysis;
# confirm against the assignment.
t_test <- t.test(x = X, y = Y, conf.level = 0.95)
print(t_test)
## 
##  Welch Two Sample t-test
## 
## data:  X and Y
## t = -86.973, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -184905.3 -176748.6
## sample estimates:
##    mean of x    mean of y 
##     94.24452 180921.19589
# 2 x 2 Pearson correlation matrix of wood-deck area and sale price.
# NOTE(review): the name `c` shadows base::c as a variable; it is kept because
# the linear-algebra section below refers to it.
c <- cor(datahouse[c("WoodDeckSF", "SalePrice")])
print(c)
##            WoodDeckSF SalePrice
## WoodDeckSF  1.0000000 0.3244134
## SalePrice   0.3244134 1.0000000
# Test the null hypothesis of zero correlation, with a 99% confidence interval.
cor.test(
  x = datahouse$WoodDeckSF,
  y = datahouse$SalePrice,
  conf.level = 0.99
)
## 
##  Pearson's product-moment correlation
## 
## data:  datahouse$WoodDeckSF and datahouse$SalePrice
## t = 13.096, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.2627778 0.3834121
## sample estimates:
##       cor 
## 0.3244134

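#We are 99% confident that the true correlation lies within the interval [0.2627778, 0.3834121]; that is, if 100 samples were taken from the population, about 99 of the intervals built this way would be expected to contain it. The p-value is negligible, so we reject the null hypothesis that there is no correlation; the estimate (about 0.32) indicates a positive but far from perfect relationship.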

#Linear Algebra:

# Precision matrix: the inverse of the correlation matrix `c`.
p <- solve(c)
p
##            WoodDeckSF SalePrice
## WoodDeckSF   1.117623 -0.362572
## SalePrice   -0.362572  1.117623
# Multiplying in either order returns the 2 x 2 identity matrix.
p %*% c # precision * correlation
##            WoodDeckSF SalePrice
## WoodDeckSF          1         0
## SalePrice           0         1
c %*% p # correlation * precision
##            WoodDeckSF SalePrice
## WoodDeckSF          1         0
## SalePrice           0         1

# PCA on the two observed variables. The data columns themselves are passed to
# princomp(): passing the 2 x 2 correlation matrix `c` as the first argument
# makes princomp() treat it as a data set with two observations, which yields
# a degenerate fit (one component holding 100% of the variance, the other 0).
pca <- princomp(datahouse[, c("WoodDeckSF", "SalePrice")], cor = TRUE)
pca
# For two standardized variables with correlation r (about 0.324 here) the
# component variances are 1 + r and 1 - r, so the reported standard deviations
# should be roughly 1.15 and 0.82, based on all 1460 observations.

# Component scores, one row per house; their off-diagonal correlation should
# be zero up to rounding error.
PC <- pca$scores
cor_PC <- cor(PC)
cor_PC

# Proportion of variance per component: (1 + r) / 2 is about 0.66 and
# (1 - r) / 2 is about 0.34.
summary_pca <- summary(pca)
summary_pca

#Discussion: the first component explains about 66% of the variance and the second about 34%, so my two variables are moderately correlated rather than linearly dependent. (An earlier run reported 100% on one component only because the PCA had been applied to the correlation matrix as if it were the raw data.)

#Calculus-Based Probability & Statistics:

library(MASS)
## Warning: package 'MASS' was built under R version 4.3.3
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Fit an exponential distribution to the wood-deck areas and compare a
# simulated sample from the fit with the observed data.
# X has a minimum of 0, so shift it by 1 to keep every value strictly
# positive before fitting.
X_shifted <- X + 1
exp_fit <- fitdistr(X_shifted, "exponential")
lambda <- exp_fit$estimate  # Extract rate parameter λ
# Fix the RNG seed so the simulated sample and its histogram are reproducible.
set.seed(1234)
samples <- rexp(1000, rate = lambda)
# Draw the two histograms side by side, then restore the previous graphics
# layout so that it does not leak into later plots.
old_par <- par(mfrow = c(1, 2))
hist(X, main = "wood deck histogram", xlab = "wood deck", col = "purple")
hist(
  samples,
  main = "Ft Exponential Distribution",
  xlab = "Sample Values",
  col = "green"
)
par(old_par)

# Empirical 5th and 95th percentiles of X.
data_per <- quantile(x = X, probs = c(0.05, 0.95))
data_per
##  5% 95% 
##   0 335
# Normal-theory 95% confidence interval for the mean of X:
# mean +/- z * (sd / sqrt(n)), with z the 97.5% standard-normal quantile.
n <- length(X)
mean_X <- mean(X)
std_X <- sd(X)
z <- qnorm(1 - 0.05/2)
margin <- z * (std_X / sqrt(n))
lower_ci <- mean_X - margin
upper_ci <- mean_X + margin
print(paste("95% Confidence Interval is: ", lower_ci, ", ", upper_ci))
## [1] "95% Confidence Interval is:  87.8153169957939 ,  100.673724100096"

The 95% confidence interval gives a plausible range for the mean wood-deck size, which helps us understand the typical size of a wood deck.

#modeling

library(tidyverse)
library(MASS)
library(dplyr)

# Multiple linear regression of sale price on 14 numeric predictors.
model <- lm(
  formula = SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond +
    YearBuilt + MasVnrArea + X1stFlrSF + X2ndFlrSF + BsmtFullBath +
    BedroomAbvGr + GarageCars + WoodDeckSF + ScreenPorch + PoolArea,
  data = datahouse
)

# Coefficient table, residual summary and fit statistics.
summary(model)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual + 
##     OverallCond + YearBuilt + MasVnrArea + X1stFlrSF + X2ndFlrSF + 
##     BsmtFullBath + BedroomAbvGr + GarageCars + WoodDeckSF + ScreenPorch + 
##     PoolArea, data = datahouse)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -452876  -17230   -1448   13892  292879 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8.548e+05  8.793e+04  -9.721  < 2e-16 ***
## MSSubClass   -1.907e+02  2.419e+01  -7.883 6.26e-15 ***
## LotArea       4.536e-01  1.001e-01   4.532 6.33e-06 ***
## OverallQual   1.958e+04  1.097e+03  17.860  < 2e-16 ***
## OverallCond   5.376e+03  9.265e+02   5.803 8.02e-09 ***
## YearBuilt     3.961e+02  4.495e+01   8.810  < 2e-16 ***
## MasVnrArea    3.242e+01  5.859e+00   5.533 3.73e-08 ***
## X1stFlrSF     7.086e+01  3.723e+00  19.033  < 2e-16 ***
## X2ndFlrSF     6.091e+01  3.329e+00  18.295  < 2e-16 ***
## BsmtFullBath  1.370e+04  1.924e+03   7.124 1.65e-12 ***
## BedroomAbvGr -7.986e+03  1.446e+03  -5.522 3.97e-08 ***
## GarageCars    1.048e+04  1.704e+03   6.147 1.02e-09 ***
## WoodDeckSF    2.697e+01  7.964e+00   3.386 0.000727 ***
## ScreenPorch   5.617e+01  1.685e+01   3.334 0.000878 ***
## PoolArea     -2.957e+01  2.348e+01  -1.260 0.208031    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35200 on 1437 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.8048, Adjusted R-squared:  0.8029 
## F-statistic: 423.2 on 14 and 1437 DF,  p-value: < 2.2e-16
# Show the diagnostic plots of `model` on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model)

# Second model: the log of sale price regressed on 11 predictors.
model1 <- lm(
  formula = log(SalePrice) ~ MSSubClass + LotArea + OverallQual +
    OverallCond + YearBuilt + X1stFlrSF + GrLivArea + BsmtFullBath +
    Fireplaces + GarageCars + ScreenPorch,
  data = datahouse
)
summary(model1)
## 
## Call:
## lm(formula = log(SalePrice) ~ MSSubClass + LotArea + OverallQual + 
##     OverallCond + YearBuilt + X1stFlrSF + GrLivArea + BsmtFullBath + 
##     Fireplaces + GarageCars + ScreenPorch, data = datahouse)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.03693 -0.06833  0.00320  0.08037  0.49918 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.288e+00  3.718e-01   8.844  < 2e-16 ***
## MSSubClass   -7.331e-04  1.022e-04  -7.171 1.18e-12 ***
## LotArea       2.035e-06  4.306e-07   4.728 2.49e-06 ***
## OverallQual   9.463e-02  4.637e-03  20.407  < 2e-16 ***
## OverallCond   5.655e-02  3.938e-03  14.362  < 2e-16 ***
## YearBuilt     3.671e-03  1.907e-04  19.249  < 2e-16 ***
## X1stFlrSF     5.062e-05  1.454e-05   3.481 0.000514 ***
## GrLivArea     2.430e-04  1.141e-05  21.287  < 2e-16 ***
## BsmtFullBath  7.341e-02  8.116e-03   9.045  < 2e-16 ***
## Fireplaces    4.503e-02  7.313e-03   6.157 9.57e-10 ***
## GarageCars    7.773e-02  7.233e-03  10.747  < 2e-16 ***
## ScreenPorch   2.960e-04  7.234e-05   4.092 4.51e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1506 on 1448 degrees of freedom
## Multiple R-squared:  0.8589, Adjusted R-squared:  0.8579 
## F-statistic: 801.5 on 11 and 1448 DF,  p-value: < 2.2e-16

#Both models look good: each has a high R^2, and in the Q-Q plots most values lie on the line.

# Diagnostic plots for the log-price model, again laid out on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model1)

library(flextable)
## Warning: package 'flextable' was built under R version 4.3.3
## 
## Attaching package: 'flextable'
## The following object is masked from 'package:purrr':
## 
##     compose
# Load the Kaggle test set; the path is specific to the author's machine.
housetest <- read.csv("C:/Users/Chafiaa/Downloads/test.csv")
# model1 predicts log(SalePrice), so exponentiate to return to the price scale.
predicted_price <- exp(predict(model1, housetest))
# Two-column data frame: the test-set Id next to its predicted price.
pred <- as.data.frame(cbind(housetest$Id, predicted_price))
names(pred) <- c("Id", "SalePrice")

# Render the first six predictions as a formatted table.
flextable(head(pred))

##    Id   SalePrice
## 1,461   118,686.3
## 1,462   142,179.6
## 1,463   163,074.8
## 1,464   187,737.6
## 1,465   186,498.0
## 1,466   174,919.8

# Write the submission file (Id, SalePrice) without row names.
# Missing predictions are filled with the median of the available predictions
# instead of 0: a sale price of 0 is not a plausible estimate, and its
# logarithm is -Inf, which breaks any log-based error measure.
# NOTE(review): presumably the NAs come from test rows that are missing one of
# the model's predictors; imputing those predictors before predict() would be
# a more principled fix. Only SalePrice is filled here; Id is assumed to be
# complete.
pred %>%
  mutate(SalePrice = coalesce(SalePrice, median(SalePrice, na.rm = TRUE))) %>%
  write.csv("Housing_MarketValue.csv", row.names = FALSE)