Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with mean \(\mu\) and standard deviation \(\sigma\), where \(\mu = \sigma = \frac{N+1}{2}\).
set.seed(246)
N <- 6 # N can be any number of your choosing greater than or equal to 6
n <- 10000 # 10,000 random uniform numbers
mu <- sigma <- (N + 1)/2
df <- data.frame(X = runif(n, min=1, max=N),
Y = rnorm(n, mean=mu, sd=sigma))
summary(df$X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##   1.001   2.218   3.474   3.491   4.741   6.000
summary(df$Y)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
## -10.069    1.150    3.496    3.503    5.884   16.416
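==> As a quick sanity check (a minimal sketch; exact values vary with the seed), the sample moments line up with the theoretical ones: a Uniform(1, N) variable has mean (1 + N)/2 = 3.5, and Y was drawn with mean = sd = 3.5:
c(mean_X = mean(df$X), theoretical = (1 + N)/2)           # uniform mean
c(mean_Y = mean(df$Y), sd_Y = sd(df$Y), theoretical = mu) # normal mean and sd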
Calculate, at a minimum, the probabilities a through c below. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.
library(dplyr)
# Define the cutoffs from the problem statement
x <- median(df$X)         # x = median of X
y <- quantile(df$Y, 0.25) # y = 1st quartile of Y
# a. P(X>x | X>y)
p_1 <- df %>%
  filter(X > x, X > y) %>%
  nrow() / n
p_2 <- df %>%
  filter(X > y) %>%
  nrow() / n
p_a <- p_1 / p_2 # P(X>x | X>y)
p_a
## [1] 0.5172771
==> Interpretation: given that X exceeds y (the 1st quartile of Y), there is about a 51.7% chance that X also exceeds its own median.
# b. P(X>x, Y>y) — joint probability (code implied by the output below)
p_b <- df %>%
  filter(X > x, Y > y) %>%
  nrow() / n
p_b
## [1] 0.3705
==> Interpretation: about 37% of draws have X above its median and Y above its 1st quartile simultaneously.
# c. P(X<x | X>y)
p_1 <- df %>%
  filter(X < x, X > y) %>%
  nrow() / n
p_2 <- df %>%
  filter(X > y) %>%
  nrow() / n
p_c <- p_1 / p_2 # P(X<x | X>y)
p_c
## [1] 0.4827229
==> Interpretation: given X > y, there is about a 48.3% chance that X falls below its median. As expected, p_a + p_c = 1, since X is continuous.
library(tidyr) # needed below for spread() and gather(); dplyr was loaded above
# Joint Probabilities:
df2 <- df %>%
  mutate(A = ifelse(X > x, " X greater than x", " X not greater than x"),
         B = ifelse(Y > y, " Y greater than y", " Y not greater than y")) %>%
  group_by(A, B) %>%
  summarise(count = n()) %>%
  mutate(probability = count / n)
## `summarise()` regrouping output by 'A' (override with `.groups` argument)
# Marginal Probabilities #A:
df2 <- df2 %>%
  ungroup() %>%
  group_by(A) %>%
  summarise(count = sum(count),
            probability = sum(probability)) %>%
  mutate(B = "Total") %>%
  bind_rows(df2)
## `summarise()` ungrouping output (override with `.groups` argument)
# Marginal Probabilities #B:
df2 <- df2 %>%
  ungroup() %>%
  group_by(B) %>%
  summarise(count = sum(count),
            probability = sum(probability)) %>%
  mutate(A = "Total") %>%
  bind_rows(df2)
## `summarise()` ungrouping output (override with `.groups` argument)
# Display as a wide contingency table (this reshaping step is implied by the output below)
df2 %>%
  select(-count) %>%
  spread(A, probability) %>%
  rename(` ` = B)
## # A tibble: 3 x 4
##   ` `                     ` X greater than x` ` X not greater than x` Total
##   <chr>                                 <dbl>                   <dbl> <dbl>
## 1 " Y greater than y"                   0.370                   0.380  0.75
## 2 " Y not greater than y"               0.130                   0.120  0.25
## 3 "Total"                               0.5                     0.5    1
=> P(X>x and Y>y) is 0.370, while P(X>x)P(Y>y) is 0.5 × 0.75 = 0.375. The two are close but not identical, so the table alone does not settle independence; the formal tests below address it.
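==> As a cross-check, both sides of the independence identity can be computed directly from the simulated data (a minimal sketch using the x and y cutoffs defined above):
p_joint <- mean(df$X > x & df$Y > y)        # P(X>x and Y>y)
p_prod  <- mean(df$X > x) * mean(df$Y > y)  # P(X>x) * P(Y>y)
c(joint = p_joint, product = p_prod)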
df3 <- df2 %>%
  filter(A != "Total",
         B != "Total") %>%
  select(-probability) %>%
  spread(A, count) %>% # back to a 2x2 table of raw counts
  as.data.frame()
row.names(df3) <- df3$B
df3 <- df3 %>%
  select(-B) %>%
  as.matrix()
fisher.test(df3)
##
## Fisher's Exact Test for Count Data
##
## data: df3
## p-value = 0.03983
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.8288777 0.9955989
## sample estimates:
## odds ratio
## 0.908444
##
chisq.test(df3)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: df3
## X-squared = 4.2245, df = 1, p-value = 0.03984
==> Fisher’s Exact Test is used when cell counts are small (expected counts below 5). The Chi-squared Test is used when the cell counts are large, as they are here, so the Chi-squared test is the more appropriate choice. Both tests give p ≈ 0.04, so at the 5% level we reject independence of the two events.
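==> The claim about cell sizes can be checked from the expected counts of the chi-squared test; a one-line sketch (all four expected counts here are in the thousands, far above the usual threshold of 5):
chisq.test(df3)$expected # expected cell counts under independence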
You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques .
==> We will load the data and get a summary of it:
# Load the training data
training_data <- read.csv("train.csv", header = TRUE, stringsAsFactors = FALSE)
# View and explore the data
dim(training_data)
## [1] 1460 81
str(training_data)
## 'data.frame': 1460 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr NA "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr NA NA NA NA ...
## $ MiscFeature : chr NA NA NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | X1stFlrSF | X2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | X3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 60 | RL | 65 | 8450 | Pave | NA | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 196 | Gd | TA | PConc | Gd | TA | No | GLQ | 706 | Unf | 0 | 150 | 856 | GasA | Ex | Y | SBrkr | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | Typ | 0 | NA | Attchd | 2003 | RFn | 2 | 548 | TA | TA | Y | 0 | 61 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 2 | 2008 | WD | Normal | 208500 |
| 2 | 20 | RL | 80 | 9600 | Pave | NA | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | None | 0 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 978 | Unf | 0 | 284 | 1262 | GasA | Ex | Y | SBrkr | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1976 | RFn | 2 | 460 | TA | TA | Y | 298 | 0 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 5 | 2007 | WD | Normal | 181500 |
| 3 | 60 | RL | 68 | 11250 | Pave | NA | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 162 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 486 | Unf | 0 | 434 | 920 | GasA | Ex | Y | SBrkr | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 1 | TA | Attchd | 2001 | RFn | 2 | 608 | TA | TA | Y | 0 | 42 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 9 | 2008 | WD | Normal | 223500 |
| 4 | 70 | RL | 60 | 9550 | Pave | NA | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | None | 0 | TA | TA | BrkTil | TA | Gd | No | ALQ | 216 | Unf | 0 | 540 | 756 | GasA | Gd | Y | SBrkr | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Detchd | 1998 | Unf | 3 | 642 | TA | TA | Y | 0 | 35 | 272 | 0 | 0 | 0 | NA | NA | NA | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 5 | 60 | RL | 84 | 14260 | Pave | NA | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 350 | Gd | TA | PConc | Gd | TA | Av | GLQ | 655 | Unf | 0 | 490 | 1145 | GasA | Ex | Y | SBrkr | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | Typ | 1 | TA | Attchd | 2000 | RFn | 3 | 836 | TA | TA | Y | 192 | 84 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 12 | 2008 | WD | Normal | 250000 |
| 6 | 50 | RL | 85 | 14115 | Pave | NA | IR1 | Lvl | AllPub | Inside | Gtl | Mitchel | Norm | Norm | 1Fam | 1.5Fin | 5 | 5 | 1993 | 1995 | Gable | CompShg | VinylSd | VinylSd | None | 0 | TA | TA | Wood | Gd | TA | No | GLQ | 732 | Unf | 0 | 64 | 796 | GasA | Ex | Y | SBrkr | 796 | 566 | 0 | 1362 | 1 | 0 | 1 | 1 | 1 | 1 | TA | 5 | Typ | 0 | NA | Attchd | 1993 | Unf | 2 | 480 | TA | TA | Y | 40 | 30 | 0 | 320 | 0 | 0 | NA | MnPrv | Shed | 700 | 10 | 2009 | WD | Normal | 143000 |
==> We will reduce the data to a few key numeric columns for our analysis:
training_data <- training_data[, c("Id", "LotArea", "YearBuilt", "YearRemodAdd",
                                   "BsmtFinSF1", "BsmtUnfSF", "TotalBsmtSF", "X1stFlrSF",
                                   "X2ndFlrSF", "GrLivArea", "GarageArea",
                                   "WoodDeckSF", "OpenPorchSF", "SalePrice")]
# View and explore the data
dim(training_data)
## [1] 1460 14
str(training_data)
## 'data.frame': 1460 obs. of 14 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd: int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
summary(training_data)
## Id LotArea YearBuilt YearRemodAdd
## Min. : 1.0 Min. : 1300 Min. :1872 Min. :1950
## 1st Qu.: 365.8 1st Qu.: 7554 1st Qu.:1954 1st Qu.:1967
## Median : 730.5 Median : 9478 Median :1973 Median :1994
## Mean : 730.5 Mean : 10517 Mean :1971 Mean :1985
## 3rd Qu.:1095.2 3rd Qu.: 11602 3rd Qu.:2000 3rd Qu.:2004
## Max. :1460.0 Max. :215245 Max. :2010 Max. :2010
## BsmtFinSF1 BsmtUnfSF TotalBsmtSF X1stFlrSF
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 334
## 1st Qu.: 0.0 1st Qu.: 223.0 1st Qu.: 795.8 1st Qu.: 882
## Median : 383.5 Median : 477.5 Median : 991.5 Median :1087
## Mean : 443.6 Mean : 567.2 Mean :1057.4 Mean :1163
## 3rd Qu.: 712.2 3rd Qu.: 808.0 3rd Qu.:1298.2 3rd Qu.:1391
## Max. :5644.0 Max. :2336.0 Max. :6110.0 Max. :4692
## X2ndFlrSF GrLivArea GarageArea WoodDeckSF
## Min. : 0 Min. : 334 Min. : 0.0 Min. : 0.00
## 1st Qu.: 0 1st Qu.:1130 1st Qu.: 334.5 1st Qu.: 0.00
## Median : 0 Median :1464 Median : 480.0 Median : 0.00
## Mean : 347 Mean :1515 Mean : 473.0 Mean : 94.24
## 3rd Qu.: 728 3rd Qu.:1777 3rd Qu.: 576.0 3rd Qu.:168.00
## Max. :2065 Max. :5642 Max. :1418.0 Max. :857.00
## OpenPorchSF SalePrice
## Min. : 0.00 Min. : 34900
## 1st Qu.: 0.00 1st Qu.:129975
## Median : 25.00 Median :163000
## Mean : 46.66 Mean :180921
## 3rd Qu.: 68.00 3rd Qu.:214000
## Max. :547.00 Max. :755000
==> We can visualise the distributions with a histogram of each variable:
library(ggplot2)
training_data[, 2:14] %>%
  gather() %>%
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_histogram(color = "black", fill = "lightblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
==> Checking relations between variables:
pairs(training_data[, c("YearBuilt", "YearRemodAdd", "BsmtUnfSF", "X1stFlrSF", "X2ndFlrSF",
                        "GrLivArea", "GarageArea", "SalePrice")])
Correlation matrix for THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
corr.data <- training_data[, c("GrLivArea", "GarageArea", "SalePrice")]
corr.matrix <- round(cor(corr.data),2)
corr.matrix
## GrLivArea GarageArea SalePrice
## GrLivArea 1.00 0.47 0.71
## GarageArea 0.47 1.00 0.62
## SalePrice 0.71 0.62 1.00
cor.test(corr.data$GrLivArea, corr.data$GarageArea, conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: corr.data$GrLivArea and corr.data$GarageArea
## t = 20.276, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4423993 0.4947713
## sample estimates:
## cor
## 0.4689975
##
cor.test(corr.data$GrLivArea, corr.data$SalePrice, conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: corr.data$GrLivArea and corr.data$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6915087 0.7249450
## sample estimates:
## cor
## 0.7086245
##
cor.test(corr.data$GarageArea, corr.data$SalePrice, conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: corr.data$GarageArea and corr.data$SalePrice
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6024756 0.6435283
## sample estimates:
## cor
## 0.6234314
==> All three tests have a very small p-value, so we reject the null hypothesis in each case: the true correlation is not 0 for any of the three pairs of variables.
==> We will now check the family-wise error rate:
fwe_training_data <- 1 - ((1 - 0.2)^3) # 3 tests, alpha = 0.2 (matching the 80% confidence level)
fwe_training_data
## [1] 0.488
==> So the probability of at least one Type I error across the three tests is about 48.8%, which is quite high. (Even at the conventional alpha = 0.05 it would be about 14%.)
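==> If familywise error is a concern, p-values can be adjusted for multiple comparisons; a minimal sketch with base R's p.adjust (the p-values here are so small that any correction leaves the conclusions unchanged):
p_raw <- c(2.2e-16, 2.2e-16, 2.2e-16) # upper bounds reported by the three cor.test calls
p.adjust(p_raw, method = "bonferroni")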
Invert the correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
precision.matrix <- solve(corr.matrix) # invert the correlation matrix
precision.matrix
## GrLivArea GarageArea SalePrice
## GrLivArea 2.02241876 -0.09790136 -1.3752185
## GarageArea -0.09790136 1.62917066 -0.9405758
## SalePrice -1.37521847 -0.94057584 2.5595621
round(corr.matrix %*% precision.matrix) # correlation x precision: the identity (rounded)
## GrLivArea GarageArea SalePrice
## GrLivArea 1 0 0
## GarageArea 0 1 0
## SalePrice 0 0 1
round(precision.matrix %*% corr.matrix) # precision x correlation: the identity again
## GrLivArea GarageArea SalePrice
## GrLivArea 1 0 0
## GarageArea 0 1 0
## SalePrice 0 0 1
library(matrixcalc) # the $L/$U output format below matches matrixcalc::lu.decomposition
lu.decomposition(corr.matrix)
## $L
## [,1] [,2] [,3]
## [1,] 1.00 0.0000000 0
## [2,] 0.47 1.0000000 0
## [3,] 0.71 0.3674753 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 0.4700 0.7100000
## [2,] 0 0.7791 0.2863000
## [3,] 0 0.0000 0.3906918
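==> As a sanity check (reusing the matrixcalc call above), multiplying the factors back together should reproduce the correlation matrix:
lu <- lu.decomposition(corr.matrix)
round(lu$L %*% lu$U, 2) # equals corr.matrix up to floating-point rounding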
Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
# Check the minimum of the chosen right-skewed variable, GarageArea
min(training_data$GarageArea)
## [1] 0
==> Running fitdistr to fit an exponential probability density function:
library(MASS)
## Warning: package 'MASS' was built under R version 3.6.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
df_fitdistr <- fitdistr(training_data$GarageArea, densfun = 'exponential')
lambda <- df_fitdistr$estimate # exponential MLE: lambda = 1/mean(GarageArea), about 0.00211
exp.dist <- rexp(1000, lambda)
==> We will now compare it with a histogram of the original variable:
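==> The plots themselves are not reproduced here; a minimal base-R sketch of the side-by-side comparison (the bin count is a judgment call):
par(mfrow = c(1, 2))
hist(exp.dist, breaks = 30, main = "Simulated rexp(1000, lambda)", xlab = "value")
hist(training_data$GarageArea, breaks = 30, main = "Observed GarageArea", xlab = "square feet")
par(mfrow = c(1, 1))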
==> Finding the 5th and 95th percentiles, here estimated from the 1,000 simulated draws (the call is implied by the named output below):
quantile(exp.dist, probs = c(0.05, 0.95))
## 5% 95%
## 26.671 1364.474
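==> For comparison, the percentiles can also be taken in closed form from the fitted exponential CDF, F(q) = 1 - exp(-lambda * q), i.e. q = -log(1 - p) / lambda:
qexp(c(0.05, 0.95), rate = lambda) # theoretical 5th and 95th percentiles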
==> Generating a 95% confidence interval for the mean from the empirical data, assuming normality, followed by the empirical 5th and 95th percentiles:
t.test(training_data$GarageArea)
##
## One Sample t-test
##
## data: training_data$GarageArea
## t = 84.528, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 462.0040 483.9563
## sample estimates:
## mean of x
## 472.9801
quantile(training_data$GarageArea, probs = c(0.05, 0.95))
## 5% 95%
## 0.0 850.1
==> The simulated exponential data is not a good fit for the observed data in this case: it is far more right-skewed than GarageArea, which is roughly bell-shaped apart from a spike at zero (houses with no garage). The exponential 5th–95th range (about 27 to 1364) is also much wider than the empirical one (0 to 850.1).
Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
# Standardize predictors
means <- sapply(training_data[,2:13],mean)
stdev <- sapply(training_data[,2:13],sd)
training_data_scaled <- as.data.frame(scale(training_data[,2:13], center=means, scale=stdev))
training_data_scaled$SalePrice <- training_data$SalePrice
training_data_scaled$Id <- training_data$Id
head(training_data_scaled)
## LotArea YearBuilt YearRemodAdd BsmtFinSF1 BsmtUnfSF TotalBsmtSF
## 1 -0.20707076 1.0506338 0.8783671 0.57522774 -0.94426706 -0.4591452
## 2 -0.09185490 0.1566800 -0.4294298 1.17159068 -0.64100836 0.4663051
## 3 0.07345481 0.9844150 0.8299302 0.09287536 -0.30153966 -0.3132614
## 4 -0.09686428 -1.8629933 -0.7200514 -0.49910256 -0.06164845 -0.6870887
## 5 0.37501979 0.9513056 0.7330564 0.46340969 -0.17480468 0.1996113
## 6 0.36049258 0.7195398 0.4908717 0.63223302 -1.13889578 -0.5959113
## X1stFlrSF X2ndFlrSF GrLivArea GarageArea WoodDeckSF OpenPorchSF
## 1 -0.79316202 1.1614536 0.3702066 0.35088009 -0.7519182 0.21642900
## 2 0.25705235 -0.7948909 -0.4823466 -0.06071021 1.6256378 -0.70424195
## 3 -0.62761099 1.1889432 0.5148362 0.63150985 -0.7519182 -0.07033736
## 4 -0.52155486 0.9369551 0.3835277 0.79053338 -0.7519182 -0.17598812
## 5 -0.04559563 1.6173231 1.2988806 1.69790291 0.7799299 0.56356723
## 6 -0.94836612 0.5017028 -0.2920446 0.03283304 -0.4327832 -0.25145296
## SalePrice Id
## 1 208500 1
## 2 181500 2
## 3 223500 3
## 4 140000 4
## 5 250000 5
## 6 143000 6
==> Model 1 below:
# attach() makes the scaled columns visible to lm() by name; passing
# data = training_data_scaled would be the more idiomatic alternative
attach(training_data_scaled)
model.1 <- lm(SalePrice ~ LotArea + YearBuilt + YearRemodAdd + BsmtFinSF1 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + GrLivArea + GarageArea + WoodDeckSF + OpenPorchSF)
summary(model.1)
##
## Call:
## lm(formula = SalePrice ~ LotArea + YearBuilt + YearRemodAdd +
## BsmtFinSF1 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X2ndFlrSF +
## GrLivArea + GarageArea + WoodDeckSF + OpenPorchSF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -626565 -18103 -3396 14109 281540
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 180921.2 1070.4 169.030 < 2e-16 ***
## LotArea 3773.3 1155.6 3.265 0.001119 **
## YearBuilt 13846.4 1496.0 9.256 < 2e-16 ***
## YearRemodAdd 11793.7 1377.5 8.561 < 2e-16 ***
## BsmtFinSF1 7883.3 3204.7 2.460 0.014013 *
## BsmtUnfSF 1408.7 3025.7 0.466 0.641591
## TotalBsmtSF 10654.5 3469.5 3.071 0.002174 **
## X1stFlrSF 15002.4 8972.3 1.672 0.094725 .
## X2ndFlrSF 16330.1 9986.8 1.635 0.102232
## GrLivArea 15749.9 11843.8 1.330 0.183790
## GarageArea 11974.5 1408.2 8.503 < 2e-16 ***
## WoodDeckSF 3831.2 1148.2 3.337 0.000869 ***
## OpenPorchSF 904.3 1158.8 0.780 0.435294
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40900 on 1447 degrees of freedom
## Multiple R-squared: 0.7371, Adjusted R-squared: 0.735
## F-statistic: 338.2 on 12 and 1447 DF, p-value: < 2.2e-16
==> We now remove the predictors with the highest p-values one at a time (backward elimination) until only significant terms remain.
==> Removed, in order: BsmtUnfSF, OpenPorchSF, X2ndFlrSF, GrLivArea, X1stFlrSF.
model.final <- lm(SalePrice ~ LotArea + YearBuilt + YearRemodAdd + BsmtFinSF1 + TotalBsmtSF + GarageArea + WoodDeckSF)
summary(model.final)
##
## Call:
## lm(formula = SalePrice ~ LotArea + YearBuilt + YearRemodAdd +
## BsmtFinSF1 + TotalBsmtSF + GarageArea + WoodDeckSF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -517656 -27613 -4338 19144 416603
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 180921 1308 138.351 < 2e-16 ***
## LotArea 8194 1387 5.909 4.28e-09 ***
## YearBuilt 7714 1773 4.351 1.45e-05 ***
## YearRemodAdd 18103 1649 10.979 < 2e-16 ***
## BsmtFinSF1 4198 1555 2.700 0.00702 **
## TotalBsmtSF 22942 1732 13.242 < 2e-16 ***
## GarageArea 23545 1626 14.477 < 2e-16 ***
## WoodDeckSF 7436 1385 5.368 9.28e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49970 on 1452 degrees of freedom
## Multiple R-squared: 0.6063, Adjusted R-squared: 0.6044
## F-statistic: 319.4 on 7 and 1452 DF, p-value: < 2.2e-16
==> Every predictor in the final model is highly significant, and it explains about 61% of the variance in SalePrice (R-squared = 0.6063, down from 0.7371 for the full model).
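==> Before moving to the test set, the standard residual diagnostics are worth a glance (a minimal sketch; the plots are not reproduced here):
# Residuals vs fitted, normal Q-Q, scale-location, and leverage plots
par(mfrow = c(2, 2))
plot(model.final)
par(mfrow = c(1, 1))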
==> Let’s see how it does on the test data…
# Load the test data and keep the same columns used for the training data
testing_data <- read.csv("test.csv")
testing_data <- testing_data[, c("Id", "LotArea", "YearBuilt", "YearRemodAdd",
                                 "BsmtFinSF1", "BsmtUnfSF", "TotalBsmtSF", "X1stFlrSF",
                                 "X2ndFlrSF", "GrLivArea", "GarageArea",
                                 "WoodDeckSF", "OpenPorchSF")]
# View and explore the data
dim(testing_data)
## [1] 1459 13
str(testing_data)
## 'data.frame': 1459 obs. of 13 variables:
## $ Id : int 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 ...
## $ LotArea : int 11622 14267 13830 9978 5005 10000 7980 8402 10176 8400 ...
## $ YearBuilt : int 1961 1958 1997 1998 1992 1993 1992 1998 1990 1970 ...
## $ YearRemodAdd: int 1961 1958 1998 1998 1992 1994 2007 1998 1990 1970 ...
## $ BsmtFinSF1 : int 468 923 791 602 263 0 935 0 637 804 ...
## $ BsmtUnfSF : int 270 406 137 324 1017 763 233 789 663 0 ...
## $ TotalBsmtSF : int 882 1329 928 926 1280 763 1168 789 1300 882 ...
## $ X1stFlrSF : int 896 1329 928 926 1280 763 1187 789 1341 882 ...
## $ X2ndFlrSF : int 0 0 701 678 0 892 0 676 0 0 ...
## $ GrLivArea : int 896 1329 1629 1604 1280 1655 1187 1465 1341 882 ...
## $ GarageArea : int 730 312 482 470 506 440 420 393 506 525 ...
## $ WoodDeckSF : int 140 393 212 360 0 157 483 0 192 240 ...
## $ OpenPorchSF : int 0 36 34 36 82 84 21 75 0 0 ...
summary(testing_data)
## Id LotArea YearBuilt YearRemodAdd BsmtFinSF1
## Min. :1461 Min. : 1470 Min. :1879 Min. :1950 Min. : 0.0
## 1st Qu.:1826 1st Qu.: 7391 1st Qu.:1953 1st Qu.:1963 1st Qu.: 0.0
## Median :2190 Median : 9399 Median :1973 Median :1992 Median : 350.5
## Mean :2190 Mean : 9819 Mean :1971 Mean :1984 Mean : 439.2
## 3rd Qu.:2554 3rd Qu.:11518 3rd Qu.:2001 3rd Qu.:2004 3rd Qu.: 753.5
## Max. :2919 Max. :56600 Max. :2010 Max. :2010 Max. :4010.0
## NA's :1
## BsmtUnfSF TotalBsmtSF X1stFlrSF X2ndFlrSF GrLivArea
## Min. : 0.0 Min. : 0 Min. : 407.0 Min. : 0 Min. : 407
## 1st Qu.: 219.2 1st Qu.: 784 1st Qu.: 873.5 1st Qu.: 0 1st Qu.:1118
## Median : 460.0 Median : 988 Median :1079.0 Median : 0 Median :1432
## Mean : 554.3 Mean :1046 Mean :1156.5 Mean : 326 Mean :1486
## 3rd Qu.: 797.8 3rd Qu.:1305 3rd Qu.:1382.5 3rd Qu.: 676 3rd Qu.:1721
## Max. :2140.0 Max. :5095 Max. :5095.0 Max. :1862 Max. :5095
## NA's :1 NA's :1
## GarageArea WoodDeckSF OpenPorchSF
## Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 318.0 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 480.0 Median : 0.00 Median : 28.00
## Mean : 472.8 Mean : 93.17 Mean : 48.31
## 3rd Qu.: 576.0 3rd Qu.: 168.00 3rd Qu.: 72.00
## Max. :1488.0 Max. :1424.00 Max. :742.00
## NA's :1
==> Scoring the test set with the final model:
# Standardize the test predictors with the training means and standard deviations
testing_data_scaled <- as.data.frame(scale(testing_data[,2:13], center=means, scale=stdev))
# test.csv has no SalePrice column, so only Id carries over
testing_data_scaled$Id <- testing_data$Id
head(testing_data_scaled)
## LotArea YearBuilt YearRemodAdd BsmtFinSF1 BsmtUnfSF TotalBsmtSF
## 1 0.11072464 -0.3399610 -1.1559837 0.05341016 -0.6726921 -0.3998799
## 2 0.37572111 -0.4392892 -1.3012945 1.05100259 -0.3649071 0.6190272
## 3 0.33193908 0.8519774 0.6361825 0.76159116 -0.9736877 -0.2950259
## 4 -0.05398395 0.8850868 0.6361825 0.34720661 -0.5504834 -0.2995848
## 5 -0.55221739 0.6864304 0.3455610 -0.39605455 1.0178620 0.5073350
## 6 -0.05177982 0.7195398 0.4424348 -0.97268490 0.4430284 -0.6711326
## X1stFlrSF X2ndFlrSF GrLivArea GarageArea WoodDeckSF OpenPorchSF Id
## 1 -0.6896926 -0.7948909 -1.1788522 1.20212368 0.3650544 -0.7042419 1461
## 2 0.4303636 -0.7948909 -0.3548443 -0.75293027 2.3835835 -0.1608952 1462
## 3 -0.6069171 0.8109610 0.2160619 0.04218737 0.9394975 -0.1910811 1463
## 4 -0.6120906 0.7582726 0.1684864 -0.01393859 2.1202971 -0.1608952 1464
## 5 0.3036136 -0.7948909 -0.4480923 0.15443927 -0.7519182 0.5333813 1465
## 6 -1.0337284 1.2485041 0.2655405 -0.15425346 0.5006868 0.5635672 1466
test_predictions <- predict(model.final, newdata = testing_data_scaled)
test_predictions <- data.frame(Id = testing_data_scaled$Id,
                               SalePrice = as.vector(test_predictions))
# A few test rows have NA predictors (see the NA's in the summary above), which
# yields NA predictions; they are set to 0 here so Kaggle accepts the file
test_predictions[is.na(test_predictions)] <- 0
head(test_predictions)
## Id SalePrice
## 1 1461 180347.4
## 2 1462 175663.6
## 3 1463 206138.5
## 4 1464 208846.6
## 5 1465 195968.7
## 6 1466 174668.4
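==> Zeroing out the NA predictions is crude. A gentler alternative (a sketch only, not what was submitted) is to median-impute the few missing test predictors before predicting:
# Hypothetical alternative: impute missing model predictors with the training medians
for (col in c("BsmtFinSF1", "TotalBsmtSF", "GarageArea")) {
  miss <- is.na(testing_data_scaled[[col]])
  testing_data_scaled[[col]][miss] <- median(training_data_scaled[[col]])
}
imputed_predictions <- predict(model.final, newdata = testing_data_scaled)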
==> Writing to a csv file for submission:
write.csv(test_predictions, "submission.csv", row.names = FALSE) # file name assumed
Kaggle.com User Name: vinayakkamath92
Kaggle.com Score: 0.49672
Kaggle.com Rank: 4973
(Kaggle score screenshot)