#install.packages('caret')
library(caret)
#install.packages('rmarkdown')
library(rmarkdown)
#install.packages('knitr')
library(knitr)
Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean of \((N+1)/2\).
N<-6
X<-runif(10000,min=0,max=N)
hist(X)
mymean<-(N+1)/2
Y<-rnorm(10000, mean=mymean)
hist(Y)
min(Y)
## [1] -1.04574
max(Y)
## [1] 7.675061
Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quantile of the Y variable. Interpret the meaning of all probabilities.
5 points a. \(P(X>x|X>y)\) b. \(P(X>x, Y>y)\) c. \(P(X<x | X>y)\)
Solution:
\(P(X>x|X>y)=0.941\), which means that if X is bigger than y (1st quartile of normal distribution), then probability of it to be bigger than the median is 0.941.
x<-median(X)
x
## [1] 3.004794
y<- quantile(Y, c(0.25), type = 7)
y
## 25%
## 2.816164
length(X[X>x&X>y])/length(X[X>y])
## [1] 0.9437524
Solution:
\(P(X>x, Y>y)=0.375\), which means that if we pull random X and Y, probability that X is bigger than the median and Y is bigger than the 1st quantile is 0.375.
pr<-0
step<-0.01
for (i in seq(min(X)+step,max(X)+step,by=step)){
for (j in seq(min(Y)+step,max(Y)+step,by=step)){
temp<-ifelse(i>x&j>y,(length(X[X>=(i-step)&X<i])/length(X))*(length(Y[Y>=(j-step)&Y<j])/length(Y)),0)
pr<-temp+pr
}
}
pr
## 25%
## 0.3763762
Solution:
\(P(X<x|X>y)=0.059\), which means that if X is bigger than y (1st quantile of normal distribution), then probability of it to be less than the median is 0.059.
length(X[X<x&X>y])/length(X[X>y])
## [1] 0.05624764
Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.
Solution:
pr1<-0
pr2<-0
pr3<-0
pr4<-0
pr5<-0
pr6<-0
pr7<-0
pr8<-0
pr9<-1
step<-0.03
for (i in seq(min(X)+step,max(X)+step,by=step)){
for (j in seq(min(Y)+step,max(Y)+step,by=step)){
temp1<-ifelse(i<=x&j<=y,(length(X[X>=(i-step)&X<i])/length(X))*(length(Y[Y>=(j-step)&Y<j])/length(Y)),0)
pr1<-temp1+pr1
temp2<-ifelse(i<=x&j>y,(length(X[X>=(i-step)&X<i])/length(X))*(length(Y[Y>=(j-step)&Y<j])/length(Y)),0)
pr2<-temp2+pr2
temp3<-ifelse(i>x&j<=y,(length(X[X>=(i-step)&X<i])/length(X))*(length(Y[Y>=(j-step)&Y<j])/length(Y)),0)
pr4<-temp3+pr4
temp4<-ifelse(i>x&j>y,(length(X[X>=(i-step)&X<i])/length(X))*(length(Y[Y>=(j-step)&Y<j])/length(Y)),0)
pr5<-temp4+pr5
pr3<-temp1+temp2+pr3
pr6<-temp3+temp4+pr6
pr7<-temp1+temp3+pr7
pr8<-temp2+temp4+pr8
}
}
ftbl = data.frame(
PrYle = c(pr1,pr4,pr7),
PrYlgt = c(pr2,pr5,pr8),
marginal = c(pr3,pr6,pr9))
rownames(ftbl) <- c('PrXle', 'PrXqt','Marginal')
ftbl
## PrYle PrYlgt marginal
## PrXle 0.1209833 0.3779167 0.4989
## PrXqt 0.1215167 0.3795832 0.5011
## Marginal 0.2425000 0.7575000 1.0000
ftbl[2,3]*ftbl[3,2]
## [1] 0.3795832
\(P(X>x and Y>y)=0.375=P(X>x)P(Y>y)=0.50*0.75=0.375\)
Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?
Solution:
Fisher’s test produces p-value of 1 which confirms independence.
Chi-squire test produces p-value of 1 that confirms independence.
As we have so many data points. 10000 for X and Y, or 10000000 all togehter considering that for a single x point there could be 10000 Y points and vice verse, chi-squire would be more appropriate.
#install.packages("rcompanion")
library(rcompanion)
ftbl1<-ftbl[1:2,1:2]*100000000
ftbl1
## PrYle PrYlgt
## PrXle 12098325 37791675
## PrXqt 12151675 37958325
fisher.test(as.matrix(ftbl1))
##
## Fisher's Exact Test for Count Data
##
## data: as.matrix(ftbl1)
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9990723 1.0009286
## sample estimates:
## odds ratio
## 1
chisq.test(as.matrix(ftbl1))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: as.matrix(ftbl1)
## X-squared = 2.8551e-25, df = 1, p-value = 1
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
getwd()
## [1] "C:/HW"
mydata<-read.csv('train.csv')
#dim(mydatatr)
#mydatat<-read.csv('test.csv')
#mydatat$SalePrice<-1
#mydata<-rbind(mydatatr,mydatat)
#head(mydata)
#summary(mydata)
mydata$MSZoning[is.na(mydata$MSZoning)]<-'RL'
mydata$KitchenQual[is.na(mydata$KitchenQual)]<-'TA'
mydata$SalePrice<-mydata$SalePrice^0.4
m<-mean(mydata$LotFrontage,na.rm = TRUE)
mydata$Alley<-NULL
mydata$LotFrontage[is.na(mydata$LotFrontage)]<-m
mydata$LotFrontage<-mydata$LotFrontage^(-0.05)
mydata$LotArea<-mydata$LotArea^(-0.1)
mydata$YearBuilt2<-mydata$YearBuilt^2
mydata$TotalBsmtSF2<-mydata$TotalBsmtSF^2
mydata$TotalBsmtSF3<-mydata$TotalBsmtSF^3
#summary(lm(SalePrice~FireplaceQu,data=mydata))
mydata$MasVnrType<-as.character(mydata$MasVnrType)
mydata$MasVnrType[is.na(mydata$MasVnrType)]<-'Unk'
mydata$MasVnrType<-as.factor(mydata$MasVnrType)
mydata$MasVnrArea[is.na(mydata$MasVnrArea)]<-mean(mydata$MasVnrArea,na.rm = TRUE)
mydata$BsmtQual<-as.character(mydata$BsmtQual)
mydata$BsmtQual[is.na(mydata$BsmtQual)]<-'Unk'
mydata$BsmtQual<-as.factor(mydata$BsmtQual)
mydata$BsmtExposure<-as.character(mydata$BsmtExposure)
mydata$BsmtExposure[is.na(mydata$BsmtExposure)]<-'Unk'
mydata$BsmtExposure<-as.factor(mydata$BsmtExposure)
mydata$BsmtFinType1<-as.character(mydata$BsmtFinType1)
mydata$BsmtFinType1[is.na(mydata$BsmtFinType1)]<-'Unk'
mydata$BsmtFinType1<-as.factor(mydata$BsmtFinType1)
mydata$BsmtFinType2<-NULL
mydata$Electrical<-NULL
mydata$FireplaceQu<-as.character(mydata$FireplaceQu)
mydata$FireplaceQu[is.na(mydata$FireplaceQu)]<-'Unk'
mydata$FireplaceQu<-as.factor(mydata$FireplaceQu)
mydata$Fence<-as.character(mydata$Fence)
mydata$Fence[is.na(mydata$Fence)]<-'Unk'
mydata$Fence<-as.factor(mydata$Fence)
m<-mean(mydata$GarageYrBlt,na.rm = TRUE)
mydata$GarageYrBlt[is.na(mydata$GarageYrBlt)]<-m
plot(mydata$LotArea,log(mydata$SalePrice))
plot(mydata$BsmtUnfSF,log(mydata$SalePrice))
library(dplyr)
library(tidyr)
library(ggplot2)
library(rvest)
library(knitr)
library(rcompanion)
library(MASS)
library(tidyverse)
library(caret)
mydata%>%dplyr::select_if(is.numeric) %>% # Keep only numeric columns
gather() %>% # Convert to key-value pairs
ggplot(aes(value)) + # Plot the values
facet_wrap(~ key, scales = "free") + # In separate panels
geom_density()
cmat<-mydata %>%dplyr::select(LotFrontage,LotArea,X1stFlrSF) %>%
cor(use = "complete.obs")
cmat
## LotFrontage LotArea X1stFlrSF
## LotFrontage 1.0000000 0.6760210 -0.3924063
## LotArea 0.6760210 1.0000000 -0.4407445
## X1stFlrSF -0.3924063 -0.4407445 1.0000000
cor.test(mydata$LotFrontage,mydata$LotArea,use = "complete.obs",method = "pearson",conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: mydata$LotFrontage and mydata$LotArea
## t = 35.03, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6573741 0.6938405
## sample estimates:
## cor
## 0.676021
cor.test(mydata$LotFrontage,mydata$X1stFlrSF,use = "complete.obs",method = "pearson",conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: mydata$LotFrontage and mydata$X1stFlrSF
## t = -16.29, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.4204309 -0.3636336
## sample estimates:
## cor
## -0.3924063
cor.test(mydata$LotArea,mydata$X1stFlrSF,use = "complete.obs",method = "pearson",conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: mydata$LotArea and mydata$X1stFlrSF
## t = -18.749, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.4673924 -0.4132964
## sample estimates:
## cor
## -0.4407445
hist(mydata$X1stFlrSF)
plot(log(mydata$GarageArea),log(mydata$X1stFlrSF))
plot(log(mydata$GrLivArea),log(mydata$SalePrice))
mydata$OverallQual<-as.factor(mydata$OverallQual)
mydata$OverallCond<-as.factor(mydata$OverallCond)
mydata$GarageCars<-as.factor(mydata$GarageCars)
mydata$Fireplaces<-as.factor(mydata$Fireplaces)
summary(lm(log(SalePrice)~log(mydata$GrLivArea),data=mydata))
##
## Call:
## lm(formula = log(SalePrice) ~ log(mydata$GrLivArea), data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.54135 -0.05704 0.01146 0.06634 0.34551
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.267250 0.062355 36.36 <2e-16 ***
## log(mydata$GrLivArea) 0.349814 0.008571 40.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1092 on 1458 degrees of freedom
## Multiple R-squared: 0.5333, Adjusted R-squared: 0.533
## F-statistic: 1666 on 1 and 1458 DF, p-value: < 2.2e-16
mydata$GrLivArea<-log(mydata$GrLivArea)
mydata$X1stFlrSF<-log(mydata$X1stFlrSF)
Yvl= transformTukey(mydata$X1stFlrSF,plotit=FALSE,returnLambda = TRUE)
##
## lambda W Shapiro.p.value
## 412 0.275 0.9961 0.0007968
##
## if (lambda > 0){TRANS = x ^ lambda}
## if (lambda == 0){TRANS = log(x)}
## if (lambda < 0){TRANS = -1 * x ^ lambda}
Yvl
## lambda
## 0.275
boxplot(log(SalePrice)~mydata$Exterior1st,data=mydata)
boxplot(log(SalePrice)~Exterior1st,data=mydata)
summary(lm(log(SalePrice)~Exterior1st,data=mydata))
##
## Call:
## lm(formula = log(SalePrice) ~ Exterior1st, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.66527 -0.08283 -0.00415 0.07823 0.67305
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.609806 0.032479 141.931 < 2e-16 ***
## Exterior1stAsphShn -0.004636 0.148838 -0.031 0.975158
## Exterior1stBrkComm -0.146491 0.107721 -1.360 0.174071
## Exterior1stBrkFace 0.225450 0.038430 5.867 5.51e-09 ***
## Exterior1stCBlock 0.014880 0.148838 0.100 0.920376
## Exterior1stCemntBd 0.272163 0.037427 7.272 5.79e-13 ***
## Exterior1stHdBoard 0.170954 0.033911 5.041 5.21e-07 ***
## Exterior1stImStucc 0.380634 0.148838 2.557 0.010648 *
## Exterior1stMetalSd 0.134101 0.033923 3.953 8.09e-05 ***
## Exterior1stPlywood 0.206682 0.035359 5.845 6.24e-09 ***
## Exterior1stStone 0.372809 0.107721 3.461 0.000554 ***
## Exterior1stStucco 0.146991 0.043575 3.373 0.000762 ***
## Exterior1stVinylSd 0.272543 0.033104 8.233 4.04e-16 ***
## Exterior1stWd Sdng 0.125597 0.034019 3.692 0.000231 ***
## Exterior1stWdShing 0.128011 0.043201 2.963 0.003095 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1453 on 1445 degrees of freedom
## Multiple R-squared: 0.1815, Adjusted R-squared: 0.1736
## F-statistic: 22.89 on 14 and 1445 DF, p-value: < 2.2e-16
mydata$Neighborhood<-ifelse(mydata$Neighborhood=='IDOTRR'|mydata$Neighborhood=='BrDale'|mydata$Neighborhood=='MeadowV','Bad',
ifelse(mydata$Neighborhood=='CollgCr'|mydata$Neighborhood=='Crawfor'|mydata$Neighborhood=='Gilbert','Ave',
ifelse(mydata$Neighborhood=='NoRidge'|mydata$Neighborhood=='NridgHt'|mydata$Neighborhood=='StoneBr','Good',
ifelse(mydata$Neighborhood=='BrkSide'|mydata$Neighborhood=='OldTown'|mydata$Neighborhood=='Edwards'|mydata$Neighborhood=='SWISU','Bad1',
ifelse(mydata$Neighborhood=='Blueste'|mydata$Neighborhood=='Mitchel'|mydata$Neighborhood=='Ames'|mydata$Neighborhood=='NPkVill'|mydata$Neighborhood=='Sawyer','Bad2',
ifelse(mydata$Neighborhood=='Timber'|mydata$Neighborhood=='Veenker','Soso','Bad3'))))))
mydata$Neighborhood<-as.factor(mydata$Neighborhood)
mydata$Condition1<-ifelse(mydata$Condition1=='Feedr'|mydata$Condition1=='RRAe','Bad','Good')
mydata$OverallQual<-ifelse(mydata$OverallQual=='2','1',mydata$OverallQual)
mydata$OverallQual<-as.factor(mydata$OverallQual)
mydata$Neighborhood<-as.factor(mydata$Neighborhood)
mydata$Condition1<-as.factor(mydata$Condition1)
For all 3 pairs Null Hypotesis was rejected. Our confidence interval indicates that there are 80% probability that correlations lies in our confidence interval.
P values are very small so it is very unlikely that Type I error can happen (our asumption that there is correlation is wrong). Family wise error possbilty does not seem to be a serious issue. Our confidence intervals have only 80% chance to be correct. I would prefer to use at least 90% or 95% intervals.
Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
#install.packages("matlib")
library(matlib)
PM <- inv(as.matrix(cmat))
PM
##
## [1,] 1.8799706 -1.1737682 0.2203804
## [2,] -1.1737682 1.9739361 0.4094075
## [3,] 0.2203804 0.4094075 1.2669228
NM<-as.matrix(cmat)%*%PM%*%as.matrix(cmat)
NM
## LotFrontage LotArea X1stFlrSF
## LotFrontage 1.0000000 0.6760210 -0.3924063
## LotArea 0.6760210 1.0000000 -0.4407445
## X1stFlrSF -0.3924063 -0.4407445 1.0000000
\(NM=LU\)
#install.packages("matrixcalc")
library(matrixcalc)
luA <- lu.decomposition(NM)
luA$L
## [,1] [,2] [,3]
## [1,] 1.0000000 0.0000000 0
## [2,] 0.6760210 1.0000000 0
## [3,] -0.3924063 -0.3231511 1
luA$U
## [,1] [,2] [,3]
## [1,] 1 0.6760210 -0.3924063
## [2,] 0 0.5429956 -0.1754696
## [3,] 0 0.0000000 0.7893141
Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
hist(mydata$GrLivArea)
summary(mydata$GrLivArea)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.811 7.030 7.289 7.268 7.483 8.638
library(MASS)
edist<-fitdistr(mydata$GrLivArea,"exponential")
edist$estimate
## rate
## 0.1375937
X<-rexp(1000,rate=edist$estimate)
hist(X)
qexp(0.05,edist$estimate)
## [1] 0.3727881
qexp(0.95,edist$estimate)
## [1] 21.77231
smean<-mean(mydata$GrLivArea)
ssd<-sd(mydata$GrLivArea)
n<-length(mydata$GrLivArea)
ciLB<-smean-qnorm(0.975)*ssd/sqrt(n)
ciUB<-smean+qnorm(0.975)*ssd/sqrt(n)
ciLB
## [1] 7.250665
ciUB
## [1] 7.284884
quantile(mydata$GrLivArea,c(0.05,0.95))
## 5% 95%
## 6.742881 7.810393
#install.packages("fitdistrplus")
library(fitdistrplus)
fit.gamma <- fitdist(mydata$GrLivArea, distr = "gamma", method = "mle")
summary(fit.gamma)
## Fitting of the distribution ' gamma ' by maximum likelihood
## Parameters :
## estimate Std. Error
## shape 473.46761 17.517168
## rate 65.14646 2.411536
## Loglikelihood: -469.5322 AIC: 943.0643 BIC: 953.6367
## Correlation matrix:
## shape rate
## shape 1.000000 0.999472
## rate 0.999472 1.000000
plot(fit.gamma)
Our data is not truly exponential, so exponential fit is somewhat off. Gamma might be better approximation. Confidence interval tells us that we are 95% confident that population mean falls into the interval.
Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
getwd()
## [1] "C:/HW"
mydata<-read.csv('train.csv')
mydatat<-read.csv('test.csv')
#dim(mydatat)
mydatat$SalePrice<-1
mydata$flag<-'train'
mydatat$flag<-'test'
mydata<-rbind(mydata,mydatat)
#dim(mydatat)
#head(mydata)
#summary(mydata%>%filter(flag=='train'))
#describe(mydata)
sprice<-mydata%>%filter(flag=='train')%>%dplyr::select(SalePrice)
spricev<-as.numeric(sprice$SalePrice)
#summary(spricev)
#typeof(spricev)
out<-MASS::boxcox(as.numeric(spricev)~1)
range(out$x[out$y > max(out$y)-qchisq(0.95,1)/2])
## [1] -0.14141414 -0.02020202
hist(spricev^(-0.08))
#mydata%>%filter(flag=='train')%>%arrange(SalePrice)
boxplot(spricev^(-0.08))
mydata$SalePriceN<-mydata$SalePrice^(-0.08)
mydata1<-mydata
#%>%filter(flag=='train')
#smp_size <- floor(0.75 * nrow(mydata1))
## set the seed to make your partition reproducible
#set.seed(123)
#train_ind <- sample(seq_len(nrow(mydata1)), size = smp_size)
#summary(mydata1$GarageArea)
mydata1$KitchenQual[is.na(mydata1$KitchenQual)]<-'TA'
mydata1$GarageArea[is.na(mydata1$GarageArea)]<-0
mydata1%>%filter(is.na(MSZoning))
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1916 30 <NA> 109 21780 Grvl <NA> Reg
## 2 2217 20 <NA> 80 14584 Pave <NA> Reg
## 3 2251 70 <NA> NA 56600 Pave <NA> IR1
## 4 2905 20 <NA> 125 31250 Pave <NA> Reg
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl <NA> Inside Gtl IDOTRR Norm
## 2 Low AllPub Inside Mod IDOTRR Norm
## 3 Low AllPub Inside Gtl IDOTRR Norm
## 4 Lvl AllPub Inside Gtl Mitchel Artery
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 1Story 2 4 1910
## 2 Norm 1Fam 1Story 1 5 1952
## 3 Norm 1Fam 2.5Unf 5 1 1900
## 4 Norm 1Fam 1Story 1 3 1951
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 1950 Gable CompShg Wd Sdng Wd Sdng None
## 2 1952 Gable CompShg AsbShng VinylSd None
## 3 1950 Hip CompShg Wd Sdng Wd Sdng None
## 4 1951 Gable CompShg CBlock VinylSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 0 Fa Fa CBlock <NA> <NA> <NA>
## 2 0 Fa Po Slab <NA> <NA> <NA>
## 3 0 TA TA BrkTil TA TA No
## 4 0 TA Fa CBlock <NA> <NA> <NA>
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 <NA> 0 <NA> 0 0 0
## 2 <NA> 0 <NA> 0 0 0
## 3 Unf 0 Unf 0 686 686
## 4 <NA> 0 <NA> 0 0 0
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA TA N FuseA 810 0 0
## 2 Wall Po N FuseA 733 0 0
## 3 GasA Ex Y SBrkr 1150 686 0
## 4 GasA TA Y FuseA 1600 0 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 810 0 0 1 0 1
## 2 733 0 0 1 0 2
## 3 1836 0 0 2 0 4
## 4 1600 0 0 1 1 3
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 TA 4 Min1 0 <NA>
## 2 1 Fa 4 <NA> 0 <NA>
## 3 1 TA 7 Maj1 0 <NA>
## 4 1 TA 6 Mod 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Detchd 1975 Unf 1 280 TA
## 2 Attchd 1952 Unf 2 487 Fa
## 3 Detchd 1900 Unf 1 288 TA
## 4 Attchd 1951 Unf 1 270 Fa
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA N 119 24 0 0
## 2 Po N 0 0 0 0
## 3 Fa N 0 0 0 0
## 4 TA N 0 0 135 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 3 2009
## 2 0 0 <NA> <NA> <NA> 0 2 2008
## 3 0 0 <NA> <NA> <NA> 0 1 2008
## 4 0 0 <NA> <NA> <NA> 0 5 2006
## SaleType SaleCondition SalePrice flag SalePriceN
## 1 ConLD Normal 1 test 1
## 2 WD Abnorml 1 test 1
## 3 WD Normal 1 test 1
## 4 WD Normal 1 test 1
mydata$MSZoning[is.na(mydata$MSZoning)]<-'RL'
mydata1$LotFrontage[is.na(mydata1$LotFrontage)]<-0
mydata$LotFrontage[is.na(mydata$LotFrontage)]<-0
mydata1$Alley<-as.character(mydata1$Alley)
mydata1$Alley[is.na(mydata1$Alley)]<-'M'
mydata1$Alley<-as.factor(mydata1$Alley)
mydata1$MasVnrType<-as.character(mydata1$MasVnrType)
mydata1$MasVnrType[is.na(mydata1$MasVnrType)]<-'M'
mydata1$MasVnrType<-as.factor(mydata1$MasVnrType)
mydata1$BsmtQual<-as.character(mydata1$BsmtQual)
mydata1$BsmtQual[is.na(mydata1$BsmtQual)]<-'M'
mydata1$BsmtQual<-as.factor(mydata1$BsmtQual)
mydata1$BsmtCond<-as.character(mydata1$BsmtCond)
mydata1$BsmtCond[is.na(mydata1$BsmtCond)]<-'M'
mydata1$BsmtCond<-as.factor(mydata1$BsmtCond)
mydata1$Condition1N<-as.factor(ifelse(mydata1$Condition1=='PosN','P','NP'))
mydata1$HouseStyleN<-as.factor(ifelse(mydata1$HouseStyle=='2.5Fin'|mydata1$HouseStyle=='2Story','2FS',mydata1$HouseStyle))
mydata1$Neighborhood1<-ifelse(mydata1$Neighborhood=='NoRidge'|mydata1$Neighborhood=='NridgHt','NN1',ifelse(mydata1$Neighborhood=='Gilbert'|mydata1$Neighborhood=='CollgCr'|mydata1$Neighborhood=='Blmngtn'|mydata1$Neighborhood=='Crawfor'|mydata1$Neighborhood=='NWAmes','GCB',ifelse(mydata1$Neighborhood=='NAmes'|mydata1$Neighborhood=='NPkVill','NN2',ifelse(mydata1$Neighborhood=='Blueste'|mydata1$Neighborhood=='SWISU','BS',ifelse(mydata1$Neighborhood=='Timber'|mydata1$Neighborhood=='Veenker'|mydata1$Neighborhood=='Somerst','TV',ifelse(mydata1$Neighborhood=='Edwards'|mydata1$Neighborhood=='OldTown','EO',mydata1$Neighborhood))))))
mydata1$Neighborhood<-NULL
mydata1$Neighborhood1<-as.factor(mydata1$Neighborhood1)
#mydata1 %>% filter(flag=='train')%>%dplyr::group_by(Neighborhood) %>% dplyr::summarise(Average=round(mean(SalePriceN),3))%>%arrange(Average)
mydata1 %>%filter(flag=='train')%>% dplyr::group_by(Neighborhood1) %>% dplyr::summarise(Average=round(mean(SalePriceN),3))%>%arrange(Average)
## # A tibble: 15 x 2
## Neighborhood1 Average
## <fct> <dbl>
## 1 NN1 0.364
## 2 22 0.366
## 3 TV 0.373
## 4 5 0.376
## 5 GCB 0.378
## 6 20 0.38
## 7 12 0.385
## 8 NN2 0.387
## 9 BS 0.388
## 10 19 0.389
## 11 EO 0.392
## 12 4 0.393
## 13 3 0.397
## 14 11 0.399
## 15 10 0.4
#BsmtExposure
mydata1$BsmtExposure<-as.character(mydata1$BsmtExposure)
mydata1$BsmtExposure[is.na(mydata1$BsmtExposure)]<-'M'
mydata1$BsmtExposure<-as.factor(mydata1$BsmtExposure)
mydata1$BsmtFinType1<-as.character(mydata1$BsmtFinType1)
mydata1$BsmtFinType1[is.na(mydata1$BsmtFinType1)]<-'M'
mydata1$BsmtFinType1<-as.factor(mydata1$BsmtFinType1)
mydata1$BsmtFinType2<-as.character(mydata1$BsmtFinType2)
mydata1$BsmtFinType2[is.na(mydata1$BsmtFinType2)]<-'M'
mydata1$BsmtFinType2<-as.factor(mydata1$BsmtFinType2)
mydata1$Electrical<-as.character(mydata1$Electrical)
mydata1$Electrical[is.na(mydata1$Electrical)]<-'M'
mydata1$Electrical<-as.factor(mydata1$Electrical)
mydata1$BsmtFullBath<-as.character(mydata1$BsmtFullBath)
mydata1$BsmtFullBath<-as.factor(mydata1$BsmtFullBath)
mydata1$BsmtHalfBath<-as.character(mydata1$BsmtHalfBath)
mydata1$BsmtHalfBath<-as.factor(mydata1$BsmtHalfBath)
mydata1$FullBath<-as.character(mydata1$FullBath)
mydata1$FullBath<-as.factor(mydata1$FullBath)
mydata1$HalfBath<-as.character(mydata1$HalfBath)
mydata1$HalfBath<-as.factor(mydata1$HalfBath)
mydata1$KitchenAbvGr<-as.character(mydata1$KitchenAbvGr)
mydata1$KitchenAbvGr<-as.factor(mydata1$KitchenAbvGr)
mydata1$FireplaceQu<-as.character(mydata1$FireplaceQu)
mydata1$FireplaceQu[is.na(mydata1$FireplaceQu)]<-'M'
mydata1$FireplaceQu<-as.factor(mydata1$FireplaceQu)
mydata1$GarageType<-as.character(mydata1$GarageType)
mydata1$GarageType[is.na(mydata1$GarageType)]<-'M'
mydata1$GarageType<-as.factor(mydata1$GarageType)
mydata1$MiscFeature<-as.character(mydata1$MiscFeature)
mydata1$MiscFeature[is.na(mydata1$MiscFeature)]<-'M'
mydata1$MiscFeature<-as.factor(mydata1$MiscFeature)
mydata1$Fence<-as.character(mydata1$Fence)
mydata1$Fence[is.na(mydata1$Fence)]<-'M'
mydata1$Fence<-as.factor(mydata1$Fence)
mydata1$GarageCars<-as.character(mydata1$GarageCars)
mydata1$GarageCars[is.na(mydata1$GarageCars)]<-'M'
mydata1$GarageCars<-as.factor(mydata1$GarageCars)
typeof(mydata1$Neighborhood1)
## [1] "integer"
mydata1$c<-as.character(mydata1$OverallCond)
mydata1$c1<-as.character(mydata1$OverallQual)
#mydata1$OverallCond<-NULL
#mydata1$OverallQual<-NULL
mydata1$YearBuilt<-mydata1$YearBuilt-1872
mydata1$YearRemodAdd<-mydata1$YearRemodAdd-1872
mydata1$MasVnrArea[is.na(mydata1$MasVnrArea)]<-0
mydata1$YrSold<-mydata1$YrSold-2006
mydata1$YrSold<-as.character(mydata1$YrSold)
mydata1$YrSold<-as.factor(mydata1$YrSold)
mydata1$MoSold<-as.character(mydata1$MoSold)
mydata1$MoSold<-as.factor(mydata1$MoSold)
mydata1$MiscFeature<-ifelse(mydata1$MiscFeature!='M','Y','M')
mydata1$GarageYrBlt[is.na(mydata1$GarageYrBlt)]<-0
#ifelse(mydata1$Neighborhood1=='Blmngt'|mydata1$Neighborhood=='CollgCr','BC',ifelse(mydata1$Neighborhood=='NAmes'|mydata1$Neighborhood=='NpkVill','NN',mydata1$Neighborhood)))
mydata1$LotArea<-mydata1$LotArea^0.04
mydata1$YearBuilt<-mydata1$YearBuilt^1.5
#dim(mydata1)
mydata1$c1<-ifelse(mydata1$c1=='1'|mydata1$c1=='2','N12',mydata1$c1)
mydata1$c<-as.factor(mydata1$c)
mydata1$c1<-as.factor(mydata1$c1)
##Outlier
mydata2<-mydata1%>%dplyr::filter(flag=='train')
mydata2<-mydata2[-1299,]
#dim(mydata2)
#colnames(mydata2)
mydata3<-mydata1%>%dplyr::filter(flag=='test')
tmod<-lm(SalePriceN~LotArea+OverallCond+YearBuilt+I(BsmtFinSF1^.2)+TotalBsmtSF+CentralAir+poly(GrLivArea,1)+KitchenQual+Fireplaces+I(GarageArea^.5)+c1:Neighborhood1,data=mydata2)
summary(tmod)
##
## Call:
## lm(formula = SalePriceN ~ LotArea + OverallCond + YearBuilt +
## I(BsmtFinSF1^0.2) + TotalBsmtSF + CentralAir + poly(GrLivArea,
## 1) + KitchenQual + Fireplaces + I(GarageArea^0.5) + c1:Neighborhood1,
## data = mydata2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.016172 -0.001959 -0.000150 0.001721 0.031350
##
## Coefficients: (61 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.565e-01 7.064e-03 64.621 < 2e-16 ***
## LotArea -4.147e-02 4.735e-03 -8.758 < 2e-16 ***
## OverallCond -1.666e-03 1.137e-04 -14.650 < 2e-16 ***
## YearBuilt -6.260e-06 5.051e-07 -12.394 < 2e-16 ***
## I(BsmtFinSF1^0.2) -6.130e-04 6.680e-05 -9.176 < 2e-16 ***
## TotalBsmtSF -3.057e-06 3.398e-07 -8.998 < 2e-16 ***
## CentralAirY -2.266e-03 5.023e-04 -4.511 7.00e-06 ***
## poly(GrLivArea, 1) -1.478e-01 5.818e-03 -25.398 < 2e-16 ***
## KitchenQualFa 3.396e-03 9.161e-04 3.707 0.000218 ***
## KitchenQualGd 1.532e-03 5.396e-04 2.839 0.004591 **
## KitchenQualTA 2.684e-03 5.887e-04 4.560 5.57e-06 ***
## Fireplaces -1.290e-03 1.944e-04 -6.635 4.66e-11 ***
## I(GarageArea^0.5) -1.624e-04 2.062e-05 -7.877 6.79e-15 ***
## c110:Neighborhood110 NA NA NA NA
## c13:Neighborhood110 1.612e-02 2.678e-03 6.021 2.22e-09 ***
## c14:Neighborhood110 2.160e-02 1.887e-03 11.448 < 2e-16 ***
## c15:Neighborhood110 1.496e-02 1.898e-03 7.883 6.46e-15 ***
## c16:Neighborhood110 1.234e-02 2.026e-03 6.090 1.46e-09 ***
## c17:Neighborhood110 7.222e-03 3.062e-03 2.358 0.018490 *
## c18:Neighborhood110 NA NA NA NA
## c19:Neighborhood110 NA NA NA NA
## c1N12:Neighborhood110 4.031e-02 4.102e-03 9.828 < 2e-16 ***
## c110:Neighborhood111 NA NA NA NA
## c13:Neighborhood111 NA NA NA NA
## c14:Neighborhood111 1.867e-02 1.958e-03 9.535 < 2e-16 ***
## c15:Neighborhood111 1.572e-02 2.133e-03 7.367 3.01e-13 ***
## c16:Neighborhood111 1.699e-02 4.066e-03 4.180 3.11e-05 ***
## c17:Neighborhood111 NA NA NA NA
## c18:Neighborhood111 NA NA NA NA
## c19:Neighborhood111 NA NA NA NA
## c1N12:Neighborhood111 NA NA NA NA
## c110:Neighborhood112 NA NA NA NA
## c13:Neighborhood112 NA NA NA NA
## c14:Neighborhood112 1.366e-02 2.626e-03 5.203 2.25e-07 ***
## c15:Neighborhood112 1.394e-02 1.657e-03 8.413 < 2e-16 ***
## c16:Neighborhood112 1.068e-02 1.701e-03 6.276 4.65e-10 ***
## c17:Neighborhood112 8.642e-03 2.080e-03 4.156 3.44e-05 ***
## c18:Neighborhood112 6.055e-03 4.027e-03 1.504 0.132939
## c19:Neighborhood112 NA NA NA NA
## c1N12:Neighborhood112 NA NA NA NA
## c110:Neighborhood119 NA NA NA NA
## c13:Neighborhood119 NA NA NA NA
## c14:Neighborhood119 1.452e-02 1.923e-03 7.551 7.87e-14 ***
## c15:Neighborhood119 1.326e-02 1.543e-03 8.595 < 2e-16 ***
## c16:Neighborhood119 1.272e-02 1.849e-03 6.875 9.37e-12 ***
## c17:Neighborhood119 NA NA NA NA
## c18:Neighborhood119 NA NA NA NA
## c19:Neighborhood119 NA NA NA NA
## c1N12:Neighborhood119 NA NA NA NA
## c110:Neighborhood120 NA NA NA NA
## c13:Neighborhood120 NA NA NA NA
## c14:Neighborhood120 1.726e-02 3.060e-03 5.642 2.04e-08 ***
## c15:Neighborhood120 1.325e-02 1.977e-03 6.702 2.99e-11 ***
## c16:Neighborhood120 1.116e-02 1.622e-03 6.883 8.89e-12 ***
## c17:Neighborhood120 8.246e-03 1.632e-03 5.054 4.92e-07 ***
## c18:Neighborhood120 6.885e-03 2.200e-03 3.130 0.001786 **
## c19:Neighborhood120 NA NA NA NA
## c1N12:Neighborhood120 NA NA NA NA
## c110:Neighborhood122 -2.844e-03 4.039e-03 -0.704 0.481418
## c13:Neighborhood122 NA NA NA NA
## c14:Neighborhood122 NA NA NA NA
## c15:Neighborhood122 NA NA NA NA
## c16:Neighborhood122 NA NA NA NA
## c17:Neighborhood122 6.358e-03 2.597e-03 2.448 0.014476 *
## c18:Neighborhood122 2.386e-03 1.660e-03 1.438 0.150731
## c19:Neighborhood122 -2.761e-03 2.156e-03 -1.281 0.200426
## c1N12:Neighborhood122 NA NA NA NA
## c110:Neighborhood13 NA NA NA NA
## c13:Neighborhood13 NA NA NA NA
## c14:Neighborhood13 NA NA NA NA
## c15:Neighborhood13 1.293e-02 2.280e-03 5.673 1.71e-08 ***
## c16:Neighborhood13 1.519e-02 1.919e-03 7.914 5.09e-15 ***
## c17:Neighborhood13 NA NA NA NA
## c18:Neighborhood13 NA NA NA NA
## c19:Neighborhood13 NA NA NA NA
## c1N12:Neighborhood13 NA NA NA NA
## c110:Neighborhood14 NA NA NA NA
## c13:Neighborhood14 2.095e-02 2.688e-03 7.794 1.28e-14 ***
## c14:Neighborhood14 1.420e-02 1.941e-03 7.314 4.39e-13 ***
## c15:Neighborhood14 1.220e-02 1.719e-03 7.100 2.00e-12 ***
## c16:Neighborhood14 8.681e-03 1.746e-03 4.971 7.49e-07 ***
## c17:Neighborhood14 5.473e-03 2.403e-03 2.277 0.022915 *
## c18:Neighborhood14 NA NA NA NA
## c19:Neighborhood14 NA NA NA NA
## c1N12:Neighborhood14 2.303e-02 3.195e-03 7.208 9.37e-13 ***
## c110:Neighborhood15 NA NA NA NA
## c13:Neighborhood15 NA NA NA NA
## c14:Neighborhood15 9.284e-03 2.624e-03 3.539 0.000416 ***
## c15:Neighborhood15 1.462e-02 2.082e-03 7.025 3.36e-12 ***
## c16:Neighborhood15 9.072e-03 1.860e-03 4.878 1.20e-06 ***
## c17:Neighborhood15 5.487e-03 1.898e-03 2.891 0.003906 **
## c18:Neighborhood15 NA NA NA NA
## c19:Neighborhood15 NA NA NA NA
## c1N12:Neighborhood15 NA NA NA NA
## c110:Neighborhood1BS NA NA NA NA
## c13:Neighborhood1BS NA NA NA NA
## c14:Neighborhood1BS 1.452e-02 3.076e-03 4.721 2.59e-06 ***
## c15:Neighborhood1BS 9.116e-03 1.976e-03 4.614 4.31e-06 ***
## c16:Neighborhood1BS 1.189e-02 1.849e-03 6.432 1.74e-10 ***
## c17:Neighborhood1BS 1.034e-02 2.643e-03 3.912 9.59e-05 ***
## c18:Neighborhood1BS NA NA NA NA
## c19:Neighborhood1BS NA NA NA NA
## c1N12:Neighborhood1BS 2.202e-02 4.158e-03 5.294 1.39e-07 ***
## c110:Neighborhood1EO 1.735e-02 2.628e-03 6.603 5.74e-11 ***
## c13:Neighborhood1EO 1.838e-02 1.908e-03 9.637 < 2e-16 ***
## c14:Neighborhood1EO 1.460e-02 1.611e-03 9.061 < 2e-16 ***
## c15:Neighborhood1EO 1.418e-02 1.545e-03 9.180 < 2e-16 ***
## c16:Neighborhood1EO 1.272e-02 1.574e-03 8.081 1.40e-15 ***
## c17:Neighborhood1EO 1.122e-02 1.673e-03 6.707 2.90e-11 ***
## c18:Neighborhood1EO 9.976e-03 2.673e-03 3.731 0.000198 ***
## c19:Neighborhood1EO 2.205e-03 4.003e-03 0.551 0.581877
## c1N12:Neighborhood1EO 1.759e-02 4.185e-03 4.204 2.80e-05 ***
## c110:Neighborhood1GCB NA NA NA NA
## c13:Neighborhood1GCB NA NA NA NA
## c14:Neighborhood1GCB 8.020e-03 2.647e-03 3.030 0.002491 **
## c15:Neighborhood1GCB 1.186e-02 1.542e-03 7.694 2.72e-14 ***
## c16:Neighborhood1GCB 9.850e-03 1.462e-03 6.738 2.36e-11 ***
## c17:Neighborhood1GCB 7.912e-03 1.435e-03 5.513 4.22e-08 ***
## c18:Neighborhood1GCB 5.303e-03 1.508e-03 3.517 0.000450 ***
## c19:Neighborhood1GCB 3.670e-03 2.310e-03 1.589 0.112367
## c1N12:Neighborhood1GCB NA NA NA NA
## c110:Neighborhood1NN1 7.966e-05 1.747e-03 0.046 0.963638
## c13:Neighborhood1NN1 NA NA NA NA
## c14:Neighborhood1NN1 NA NA NA NA
## c15:Neighborhood1NN1 NA NA NA NA
## c16:Neighborhood1NN1 7.940e-03 3.016e-03 2.633 0.008563 **
## c17:Neighborhood1NN1 6.219e-03 1.576e-03 3.945 8.38e-05 ***
## c18:Neighborhood1NN1 3.929e-03 1.470e-03 2.673 0.007612 **
## c19:Neighborhood1NN1 7.400e-04 1.539e-03 0.481 0.630690
## c1N12:Neighborhood1NN1 NA NA NA NA
## c110:Neighborhood1NN2 NA NA NA NA
## c13:Neighborhood1NN2 1.206e-02 2.703e-03 4.463 8.75e-06 ***
## c14:Neighborhood1NN2 1.350e-02 1.702e-03 7.930 4.51e-15 ***
## c15:Neighborhood1NN2 1.272e-02 1.502e-03 8.465 < 2e-16 ***
## c16:Neighborhood1NN2 1.180e-02 1.503e-03 7.847 8.51e-15 ***
## c17:Neighborhood1NN2 1.093e-02 1.776e-03 6.151 1.01e-09 ***
## c18:Neighborhood1NN2 1.008e-02 2.591e-03 3.890 0.000105 ***
## c19:Neighborhood1NN2 NA NA NA NA
## c1N12:Neighborhood1NN2 NA NA NA NA
## c110:Neighborhood1TV 2.919e-03 4.013e-03 0.727 0.467207
## c13:Neighborhood1TV NA NA NA NA
## c14:Neighborhood1TV NA NA NA NA
## c15:Neighborhood1TV 1.185e-02 2.026e-03 5.848 6.22e-09 ***
## c16:Neighborhood1TV 9.163e-03 1.643e-03 5.578 2.92e-08 ***
## c17:Neighborhood1TV 6.173e-03 1.505e-03 4.102 4.34e-05 ***
## c18:Neighborhood1TV 4.322e-03 1.477e-03 2.925 0.003498 **
## c19:Neighborhood1TV NA NA NA NA
## c1N12:Neighborhood1TV NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.00377 on 1372 degrees of freedom
## Multiple R-squared: 0.9102, Adjusted R-squared: 0.9046
## F-statistic: 161.7 on 86 and 1372 DF, p-value: < 2.2e-16
plot(tmod)
## Warning: not plotting observations with leverage one:
## 59, 376, 725, 818, 917, 1069, 1101, 1442
## Warning: not plotting observations with leverage one:
## 59, 376, 725, 818, 917, 1069, 1101, 1442
mydata3$SalePriceN<-NULL
distPred <- predict(tmod, mydata3)
## Warning in predict.lm(tmod, mydata3): prediction from a rank-deficient fit
## may be misleading
distpredF<-1/(distPred^12.5)
submission<-cbind(mydata3$Id,distpredF)
#submission
My Kaggle User Name is MikeGroysman and my best Kaggle Score is 0.13643 for the LASSO model above.
I developed 3 models, 2 manual and one LASSO based. I guess I cannot beat intelligent algorithm as LASSO was the best. I will keep trying. I cannot make LASSO to work today, so I will submit the latest model that works. I created the model manually. The model has issues - residuals have tails on both sides and variance is not constant. Maybe, this is why model did not perform as well as I expected.