Load the libraries
library(readr)
library(tidyverse)
library(pracma)
library(MASS)
set.seed(123)
X <- runif(10000, min = 1, max = 8)    # X ~ Uniform(1, 8)
Y <- rnorm(10000, mean = (8 + 1) / 2)  # Y ~ Normal with mean (8 + 1)/2 = 4.5
x <- median(X)                         # x: the median of X
y <- summary(Y)[2][[1]]                # y: the first quartile of Y
sum(X > x & X > y) / sum(X > y)  # P(X > x | X > y)
## [1] 0.8368201
sum(X > x & Y > y) / length(X)   # P(X > x, Y > y)
## [1] 0.3756
sum(X < x & X > y) / sum(X > y)  # P(X < x | X > y)
## [1] 0.1631799
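A more readable equivalent formulation (a sketch: subsetting X to X > y does the conditioning directly, and mean() of a logical vector gives a proportion):
mean(X[X > y] > x)  # P(X > x | X > y)
mean(X[X > y] < x)  # P(X < x | X > y)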
# Build the 3x3 table of counts, then append marginal totals
tabl <- c(sum(X < x & Y < y),
          sum(X < x & Y == y),
          sum(X < x & Y > y))
tabl <- rbind(tabl,
              c(sum(X == x & Y < y),
                sum(X == x & Y == y),
                sum(X == x & Y > y)))
tabl <- rbind(tabl,
              c(sum(X > x & Y < y),
                sum(X > x & Y == y),
                sum(X > x & Y > y)))
tabl <- cbind(tabl, tabl[, 1] + tabl[, 2] + tabl[, 3])  # row totals
tabl <- rbind(tabl, tabl[1, ] + tabl[2, ] + tabl[3, ])  # column totals
colnames(tabl) <- c("Y<y", "Y=y", "Y>y", "Total")
rownames(tabl) <- c("X<x", "X=x", "X>x", "Total")
knitr::kable(tabl)
|       |  Y<y | Y=y |  Y>y | Total |
|:------|-----:|----:|-----:|------:|
| X<x   | 1256 |   0 | 3744 |  5000 |
| X=x   |    0 |   0 |    0 |     0 |
| X>x   | 1244 |   0 | 3756 |  5000 |
| Total | 2500 |   0 | 7500 | 10000 |
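As an aside, the same counts (minus the all-zero X=x and Y=y margins) can be built more compactly; a sketch using base R's cut() and addmargins():
addmargins(table(cut(X, c(-Inf, x, Inf), labels = c("X<x", "X>x")),
                 cut(Y, c(-Inf, y, Inf), labels = c("Y<y", "Y>y"))))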
3756/10000
## [1] 0.3756
(5000/10000) * (7500/10000)
## [1] 0.375
From the table, the joint probability P(X>x and Y>y) = 0.3756 is essentially equal to the product of the marginals P(X>x)P(Y>y) = 0.5 × 0.75 = 0.375, so the two events appear to be independent.
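The same comparison can be computed directly rather than read off the table:
c(joint = mean(X > x & Y > y),
  product = mean(X > x) * mean(Y > y))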
fisher.test(table(X>x,Y>y))
##
## Fisher's Exact Test for Count Data
##
## data: table(X > x, Y > y)
## p-value = 0.7995
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9242273 1.1100187
## sample estimates:
## odds ratio
## 1.012883
The p-value (0.7995) is much greater than 0.05, so we fail to reject the null hypothesis: Fisher's exact test gives no evidence of association between the events X > x and Y > y.
The Chi-Square Test
chisq.test(table(X>x,Y>y))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(X > x, Y > y)
## X-squared = 0.064533, df = 1, p-value = 0.7995
Again, the p-value (0.7995) is much greater than 0.05, so we fail to reject the null hypothesis of independence. The chi-square test is a large-sample approximation while Fisher's test is exact, but with 10,000 observations the two agree closely.
1] Read the data
train <- read.csv("train.csv")
test <- read.csv("test.csv")
hist(train$SalePrice, main="Distribution of SalePrice",xlab="Sales Price")
The histogram shows that SalePrice is unimodal but clearly right-skewed, not symmetric.
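A quick numeric check of the skew (a base-R sketch; the third standardized sample moment is positive for right-skewed data):
sp <- train$SalePrice
mean((sp - mean(sp))^3) / sd(sp)^3  # > 0 indicates right skew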
barplot(table(train$SaleCondition), main="Sale Condition")
Most of the houses were sold under the Normal sale condition.
pairs(train[,c("SalePrice","LotArea","TotalBsmtSF")])
The scatter plots show that LotArea and TotalBsmtSF are both positively associated with SalePrice.
cormat <- cor(train[,c("SalePrice","LotArea","TotalBsmtSF")])
cormat
## SalePrice LotArea TotalBsmtSF
## SalePrice 1.0000000 0.2638434 0.6135806
## LotArea 0.2638434 1.0000000 0.2608331
## TotalBsmtSF 0.6135806 0.2608331 1.0000000
SalePrice is positively correlated with both LotArea (r = 0.26) and TotalBsmtSF (r = 0.61).
Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval.
SalePrice vs LotArea
Null hypothesis: there is no correlation between LotArea and SalePrice.
Alternative hypothesis: there is a nonzero correlation between LotArea and SalePrice.
cor.test(train$SalePrice, train$LotArea, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: train$SalePrice and train$LotArea
## t = 10.445, df = 1458, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2323391 0.2947946
## sample estimates:
## cor
## 0.2638434
Since the p-value is far below 0.05, we reject the null hypothesis and conclude that SalePrice and LotArea are correlated. The 80 percent confidence interval for the correlation is (0.232, 0.295).
SalePrice vs TotalBsmtSF
Null hypothesis: there is no correlation between TotalBsmtSF and SalePrice.
Alternative hypothesis: there is a nonzero correlation between TotalBsmtSF and SalePrice.
cor.test(train$SalePrice, train$TotalBsmtSF, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: train$SalePrice and train$TotalBsmtSF
## t = 29.671, df = 1458, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5922142 0.6340846
## sample estimates:
## cor
## 0.6135806
Since the p-value is far below 0.05, we reject the null hypothesis and conclude that TotalBsmtSF and SalePrice are correlated. The 80 percent confidence interval for the correlation is (0.592, 0.634).
precision_mat <- solve(cormat)        # the precision matrix is the inverse of the correlation matrix
cor_prec <- cormat %*% precision_mat  # should give the identity matrix
cor_prec
## SalePrice LotArea
## SalePrice 0.99999999999999988897770 0.00000000000000000000000
## LotArea -0.00000000000000002775558 1.00000000000000022204460
## TotalBsmtSF 0.00000000000000000000000 0.00000000000000005551115
## TotalBsmtSF
## SalePrice 0.00000000000000000000000
## LotArea 0.00000000000000005551115
## TotalBsmtSF 0.99999999999999988897770
prec_cor <- precision_mat %*% cormat
prec_cor
## SalePrice LotArea
## SalePrice 1.00000000000000000000000 0.00000000000000000000000
## LotArea 0.00000000000000004163336 1.00000000000000022204460
## TotalBsmtSF -0.00000000000000022204460 -0.00000000000000005551115
## TotalBsmtSF
## SalePrice 0.00000000000000022204460
## LotArea 0.00000000000000008326673
## TotalBsmtSF 0.99999999999999977795540
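Both products recover the 3×3 identity matrix up to floating-point rounding, as expected, since the precision matrix is the inverse of the correlation matrix. A quick programmatic check (a sketch; check.attributes = FALSE ignores the dimnames):
all.equal(cor_prec, diag(3), check.attributes = FALSE)
all.equal(prec_cor, diag(3), check.attributes = FALSE)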
lu(cormat)
## $L
## SalePrice LotArea TotalBsmtSF
## SalePrice 1.0000000 0.0000000 0
## LotArea 0.2638434 1.0000000 0
## TotalBsmtSF 0.6135806 0.1063472 1
##
## $U
## SalePrice LotArea TotalBsmtSF
## SalePrice 1 0.2638434 0.61358055
## LotArea 0 0.9303867 0.09894398
## TotalBsmtSF 0 0.0000000 0.61299649
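As a sanity check, multiplying the factors back together should reconstruct the correlation matrix (a sketch using pracma's lu()):
decomp <- lu(cormat)
max(abs(decomp$L %*% decomp$U - cormat))  # effectively zero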
(fd <- fitdistr(train$LotArea, "exponential"))
## rate
## 0.000095085704
## (0.000002488507)
fd$estimate
## rate
## 0.0000950857
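For an exponential distribution the maximum-likelihood estimate of the rate is the reciprocal of the sample mean, so the fit is easy to verify by hand:
1 / mean(train$LotArea)  # should match fd$estimate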
values <- rexp(10000, rate = fd$estimate)
par(mfrow=c(1,2))
hist(train$LotArea, breaks=100, prob=TRUE, xlab="Lot Area",
main="Lot Area Dist")
hist(values, breaks=100, prob=TRUE, xlab="Simulation",
main="Exp Dist")
The simulated exponential sample reproduces the strong right skew of LotArea, so an exponential distribution is a reasonable, if rough, approximation. Using the empirical CDF of the simulated values, the 5th and 95th percentiles are:
Fn <- ecdf(values)
values[Fn(values)==0.05]
## [1] 534.3741
values[Fn(values)==0.95]
## [1] 31856.83
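Equivalently (up to the interpolation rule), quantile() reads the same percentiles off directly:
quantile(values, c(0.05, 0.95))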
t.test(values)$conf.int
## [1] 10334.69 10747.85
## attr(,"conf.level")
## [1] 0.95
t.test(train$LotArea)$conf.int
## [1] 10004.42 11029.24
## attr(,"conf.level")
## [1] 0.95
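The two 95% confidence intervals for the mean overlap; the interval from the simulated data is narrower mainly because the simulated sample is larger (10,000 values versus the 1,460 rows of train).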
#1] Find out variables with missing values
names(train[, colSums(is.na(train)) > 0])
## [1] "LotFrontage" "Alley" "MasVnrType" "MasVnrArea"
## [5] "BsmtQual" "BsmtCond" "BsmtExposure" "BsmtFinType1"
## [9] "BsmtFinType2" "Electrical" "FireplaceQu" "GarageType"
## [13] "GarageYrBlt" "GarageFinish" "GarageQual" "GarageCond"
## [17] "PoolQC" "Fence" "MiscFeature"
#2] Remove variables with missing values (capture the names first so the
#   same columns are dropped from both train and test)
na_cols <- names(train)[colSums(is.na(train)) > 0]
train <- train[, !colnames(train) %in% na_cols]
test <- test[, !colnames(test) %in% na_cols]
#3] Integer-encode categorical variables (a simple stand-in for dummy variables)
train <- train%>%
mutate_if(is.character, as.factor)%>%
mutate_if(is.factor, as.integer)
test <- test %>%
mutate_if(is.character, as.factor)%>%
mutate_if(is.factor, as.integer)
#4] Drop any remaining rows with missing values
train <- na.omit(train)
#5] Log-transform the sale price
train$SalePrice <- log(train$SalePrice)
#6] Fit the model
fit <- lm(SalePrice ~ ., data = train)
plot(fit)
## Warning: not plotting observations with leverage one:
## 945
## Warning: not plotting observations with leverage one:
## 945
The diagnostic plots indicate that the assumptions of the multiple regression model are not fully satisfied: the residuals deviate from normality and show heteroscedasticity, and there are possible outliers and high-leverage points (e.g., observation 945). In short, this is not a very good model fit, and there is considerable room for improvement.
predicted <- predict(fit, test)
## Warning in predict.lm(fit, test): prediction from a rank-deficient fit may
## be misleading
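Since the model was fit to log(SalePrice), the predictions must be exponentiated before submission. A hypothetical submission step (the Id and SalePrice column names come from the Kaggle House Prices competition; the file name is illustrative):
submission <- data.frame(Id = test$Id, SalePrice = exp(predicted))
write.csv(submission, "submission.csv", row.names = FALSE)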
Kaggle username: sachid. Final score: 0.20829.
The score is not very good, which is consistent with the poor model fit noted above. There is substantial room to improve the model, which should in turn raise the score.