DATA 605 - COMPUTATIONAL MATHEMATICS, FINAL

library(e1071)
library(pander)
library(ggplot2)
library(reshape2)
library(MASS)
library(matrixcalc)
library(car)
library(psych)
library(compositions)
library(DiagrammeR)
library(pracma)
library(knitr)
library(stats)
library(kableExtra)
library(grid)
library(gridExtra)
library(data.table)
library(RCurl)
options(scipen=8)
set.seed(0305)

Problem 1

Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean of (N+1)/2.

Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

#Random uniform and normal generators
X <- runif(10000, min = 1, max=6)
head(X)
## [1] 5.002059 5.852818 3.280193 3.015978 2.319642 5.673100
Y <- rnorm(10000, mean=(7/2), sd=(7/2))  # N = 6, so (N+1)/2 = 3.5
x=median(X)
y<-quantile(Y, 0.25)
probstr <-as.data.frame(cbind(X,Y))  
summary(probstr)
##        X               Y         
##  Min.   :1.001   Min.   :-9.592  
##  1st Qu.:2.260   1st Qu.: 1.109  
##  Median :3.495   Median : 3.469  
##  Mean   :3.509   Mean   : 3.496  
##  3rd Qu.:4.767   3rd Qu.: 5.859  
##  Max.   :6.000   Max.   :16.886
#Create a matrix based on the probabilities conditions
#Total Observations
total.obs<-nrow(probstr)

#a.   P(X>x | X>y)      b.  P(X>x, Y>y)     c.  P(X<x | X>y)        

#Number of observations where X>x and X>y
XgxXgy.data <- probstr[which( probstr$X >x & probstr$X>y),]
XgxXgy <-  nrow(XgxXgy.data)

#Number of observations X>x and Y>y
XgxYgy.data <- probstr[which(probstr$X>x & probstr$Y>y),] 
XgxYgy <- nrow(XgxYgy.data)

#Number of observations where X<x and X>y
XlxXgy.dat <- probstr[which(probstr$X<=x & probstr$X>y),]
XlxXgy <- nrow(XlxXgy.dat)

#Number of observations where X>x and Y<y
XgxYly.dat <- probstr[which(probstr$X>x & probstr$Y<y),]
XgxYly <- nrow(XgxYly.dat)

#Number of observations where Xlx and Ygy
XlxYgy.dat <- probstr[which(probstr$X<x & probstr$Y>y),]
XlxYgy <- nrow(XlxYgy.dat)

#Number of observations where Xlx and Yly
XlxYly.dat <- probstr[which(probstr$X<x & probstr$Y<y),]
XlxYly <- nrow(XlxYly.dat)

#XlxYgy,XlxYly
#Number of observations where  X>y
Xgy.dat <- probstr[which(probstr$X>y),]
Xgy <- nrow(Xgy.dat)

#Number of observations where X>x
Xgx.dat <- probstr[which(probstr$X>x),]
Xgx <-nrow(Xgx.dat)

#Number of observations where Y>y
Ygy.dat <- probstr[which(probstr$Y>y),]
Ygy <-nrow(Ygy.dat)

#Number of observations where X<x
Xlx.dat <- probstr[which(probstr$X<x),]
Xlx <- nrow(Xlx.dat)

#Number of observations where Y<y
Yly.dat <- probstr[which(probstr$Y<y),]
Yly <-nrow(Yly.dat)

#Number of observations where X<y
Xly.dat <- probstr[which(probstr$X<y),]
Xly <-nrow(Xly.dat)

#Number of observations where Y>x
Ygx.dat <- probstr[which(probstr$Y>x),]
Ygx <-nrow(Ygx.dat)

#Number of observations where Y<x
Ylx.dat <- probstr[which(probstr$Y<x),]
Ylx <-nrow(Ylx.dat)

a. P(X>x | X>y)

#Calc 1
(length(X[X > x &  X > y])/length(X)) / (length(X[X > y])/ length(X))
## [1] 0.5104124
Calc 2: by the definition of conditional probability. Because \(y < x\) here, the event \(\{X>x\}\) is contained in \(\{X>y\}\), so

\(P(X>x \mid X>y)=\frac{ P(X>x,\, X>y) }{ P(X>y) } =\frac{ P(X>x) }{ P(X>y) }\)

This reads as: there is about a 51% chance that X (the uniform variable) is greater than its median x (approximately 3.50), given that X is greater than y (approximately 1.11, the 1st quartile of Y).

b. P(X>x, Y>y)

(length(X[X > x &  Y > y])/length(X)) 
## [1] 0.3754

\(P(X>x,\,Y>y)=P(X>x)\,P(Y>y)\)

This reads as: there is about a 37.5% chance that X is greater than its median (approximately 3.50) and Y is greater than its 1st quartile (approximately 1.11).

This is known as a joint probability

c. P(X<x | X>y)

(length(X[X < x &  X > y])/length(X)) / (length(X[X > y])/ length(X))
## [1] 0.4895876

This reads as: there is about a 49% chance that X (the uniform variable) is less than its median (approximately 3.50), given that X is greater than y (approximately 1.11, the 1st quartile of Y).

\(P(X<x \mid X>y)=\frac{ P(X<x,\, X>y) }{ P(X>y) } =\frac{ P(y<X<x) }{ P(X>y) }\)

Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.

#Frequency Table
cont.table <- matrix(c(Xgx,Ygx,Xlx,Ylx,Xgy,Ygy,Xly,Yly),nrow=2,ncol=4)
rownames(cont.table) <- c('X','Y')
kable(cont.table, digits = 4, col.names = c('>x','<x','>y','<y'), align = "l", caption = 'Observed Data Joint Frequencies')
Observed Data Joint Frequencies

      >x     <x     >y     <y
X     5000   5000   9796   204
Y     4972   5028   7500   2500
#Contingency Table
cont.table <- matrix(c(XgxYgy,XgxYly,XlxYgy,XlxYly),nrow=2,ncol=2)
rownames(cont.table) <- c('(X>x)','(X<x)')
kable(cont.table, digits = 4, col.names = c('(Y>y)','(Y<y)'), align = "l", caption = 'Observed Data Joint Probabilities')
Observed Data Joint Probabilities

        (Y>y)   (Y<y)
(X>x)   3754    3746
(X<x)   1246    1254
#P(X>x and Y>y)=P(X>x)P(Y>y)

#Calculating  Joint Probs
Xgxp<-Xgx/total.obs
Ygyp<-Ygy/total.obs
Xlxp<-Xlx/total.obs
Ylyp<-Yly/total.obs
XgxYgyp<-round(XgxYgy/total.obs,3)
XgxYlyp<-round(XgxYly/total.obs,3)
XlxYgyp<-round(XlxYgy/total.obs,3)
XlxYlyp<-round(XlxYly/total.obs,3)

Testing the condition with the computed probabilities:

\(P(X>x \text{ and } Y>y)=P(X>x)\,P(Y>y)\)

identical(XgxYgyp, Xgxp*Ygyp)
## [1] TRUE
XgxYgy/total.obs
## [1] 0.3754
Xgxp*Ygyp
## [1] 0.375
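The identical() check above compares a joint probability rounded to three decimals against an exact product, so the result depends on the rounding. A minimal sketch of a more robust comparison on the unrounded values already computed above:

# Compare the unrounded joint probability with the product of the marginals;
# all.equal() reports the (small) difference instead of requiring exact equality
all.equal(XgxYgy / total.obs, Xgxp * Ygyp)
abs(XgxYgy / total.obs - Xgxp * Ygyp)   # absolute difference, attributable to sampling noise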

Chi-Square and Fisher’s

According to this check, the variables X and Y appear to be independent. Let's confirm with the Chi-Square and Fisher's exact tests.

Null Hypothesis (\(H_0\)): The probability of X>x is not influenced by the probability of Y>y.

Alternative Hypothesis (\(H_A\)): The probability of X>x is influenced by the probability of Y>y.

#Chi Square Test
chisq.test(cont.table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont.table
## X-squared = 0.026133, df = 1, p-value = 0.8716
#Fisher Test 
fisher.test(cont.table)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  cont.table
## p-value = 0.8716
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9202847 1.1052820
## sample estimates:
## odds ratio 
##    1.00857

Both p-values are well above the 0.05 significance threshold, reaffirming that the two variables are independent; we fail to reject the null hypothesis.

Problem 2

Load Data from Kaggle

# Load training data from GitHub
path <- ('https://raw.githubusercontent.com/sortega7878/DATA605CSV/master/train.csv')
con <- file(path, open="r")
train <- read.csv(con, header=T, stringsAsFactors = F)
close(con)

# Load test data from GitHub
path <- ('https://raw.githubusercontent.com/sortega7878/DATA605CSV/master/test.csv')
con <- file(path, open="r")
test <- read.csv(con, header=T, stringsAsFactors = F)
close(con)

Define Variables

Pick two of the quantitative independent variables and one dependent variable from the training data set.

Looking at the mean and median values of each variable, as given by the summary function, we can choose our right-skewed variable to represent "X" by using a predictor whose mean is larger than its median. The variables that appear to be the most right-skewed are LotArea, TotalBsmtSF, GrLivArea, X1stFlrSF, BsmtFinSF1, MasVnrArea, and OverallCond. LotArea appears to be the most skewed of these, so it is chosen as our X variable. As the second independent variable we select GrLivArea (the above-grade living area in square feet), which will be defined as Z. SalePrice is the dependent variable, and we will define it as Y.
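As a sketch of how this choice can be checked programmatically (using the skewness function from e1071, already loaded above), the numeric columns can be ranked by sample skewness:

# Rank the numeric predictors by right-skewness (NAs ignored)
num_cols <- sapply(train, is.numeric)
skew_vals <- sapply(train[, num_cols], function(v) skewness(v, na.rm = TRUE))
head(sort(skew_vals, decreasing = TRUE), 10)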

X <- train$LotArea
Z <- train$GrLivArea
Y <- train$SalePrice

Descriptive and Inferential Statistics.

Univariate descriptive statistics

Provide univariate descriptive statistics and appropriate plots for the training data set.

Due to the number of variables (81), a table of descriptive statistics for the numerical variables has been created. The table displays the mean, minimum value, median, maximum value, IQR, standard deviation, and skewness, with skewness computed by the skewness function in the e1071 package.

# number of variable types
table(sapply(train, class))
## 
## character   integer 
##        43        38

Only numerical predictors were used, as there are 40+ character vectors. We could split up these character vectors by making binary categorical variables (1 if a value is present, 0 if not) for each value within a character vector; however, this would result in many variables, most of which probably would not contribute much to predicting the Sale Price of the property when we build a model in a later section. A sketch of this encoding is shown below for reference.
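For illustration only, one way to expand a single character vector into binary indicator columns is model.matrix; the sketch below assumes the Neighborhood column of the Kaggle training file and is not used in the models that follow:

# One-hot encode a single character predictor (illustrative sketch only)
nbhd <- factor(train$Neighborhood)
nbhd_dummies <- model.matrix(~ nbhd - 1)
dim(nbhd_dummies)   # one indicator column per neighborhood level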

# subset training data numerical values
train_int <- train[c(2, 4, 5, 18:21, 27, 35, 37:39, 44:53, 55, 57, 60, 62:63, 67:72, 76:78, 81)]

means <- sapply(train_int, function(y) mean(y, na.rm = TRUE))
mins <- sapply(train_int, function(y) min(y, na.rm=TRUE))
medians <- sapply(train_int, function(y) median(y, na.rm = TRUE))
maxs <- sapply(train_int, function(y) max(y, na.rm=TRUE))
IQRs <- sapply(train_int, function(y) IQR(y, na.rm = TRUE))
SDs <- sapply(train_int, function(y) sd(y, na.rm = T))
skews <- sapply(train_int, function(y) skewness(y, na.rm = TRUE))

datasummary <- data.frame(means, mins, medians, maxs, IQRs, SDs, skews)
colnames(datasummary) <- c("MEAN", "MIN","MEDIAN", "MAX", "IQR", "STD. DEV", "SKEW")
datasummary <- round(datasummary, 2)

pander(datasummary)
  MEAN MIN MEDIAN MAX IQR STD. DEV SKEW
MSSubClass 56.9 20 50 190 50 42.3 1.4
LotFrontage 70.05 21 69 313 21 24.28 2.16
LotArea 10517 1300 9478 215245 4048 9981 12.18
OverallQual 6.1 1 6 10 2 1.38 0.22
OverallCond 5.58 1 5 9 1 1.11 0.69
YearBuilt 1971 1872 1973 2010 46 30.2 -0.61
YearRemodAdd 1985 1950 1994 2010 37 20.65 -0.5
MasVnrArea 103.7 0 0 1600 166 181.1 2.66
BsmtFinSF1 443.6 0 383.5 5644 712.2 456.1 1.68
BsmtFinSF2 46.55 0 0 1474 0 161.3 4.25
BsmtUnfSF 567.2 0 477.5 2336 585 441.9 0.92
TotalBsmtSF 1057 0 991.5 6110 502.5 438.7 1.52
X1stFlrSF 1163 334 1087 4692 509.2 386.6 1.37
X2ndFlrSF 347 0 0 2065 728 436.5 0.81
LowQualFinSF 5.84 0 0 572 0 48.62 8.99
GrLivArea 1515 334 1464 5642 647.2 525.5 1.36
BsmtFullBath 0.43 0 0 3 1 0.52 0.59
BsmtHalfBath 0.06 0 0 2 0 0.24 4.09
FullBath 1.57 0 2 3 1 0.55 0.04
HalfBath 0.38 0 0 2 1 0.5 0.67
BedroomAbvGr 2.87 0 3 8 1 0.82 0.21
KitchenAbvGr 1.05 0 1 3 0 0.22 4.48
TotRmsAbvGrd 6.52 2 6 14 2 1.63 0.67
Fireplaces 0.61 0 1 3 1 0.64 0.65
GarageYrBlt 1979 1900 1980 2010 41 24.69 -0.65
GarageCars 1.77 0 2 4 1 0.75 -0.34
GarageArea 473 0 480 1418 241.5 213.8 0.18
WoodDeckSF 94.24 0 0 857 168 125.3 1.54
OpenPorchSF 46.66 0 25 547 68 66.26 2.36
EnclosedPorch 21.95 0 0 552 0 61.12 3.08
X3SsnPorch 3.41 0 0 508 0 29.32 10.28
ScreenPorch 15.06 0 0 480 0 55.76 4.11
PoolArea 2.76 0 0 738 0 40.18 14.8
MiscVal 43.49 0 0 15500 0 496.1 24.43
MoSold 6.32 1 6 12 3 2.7 0.21
YrSold 2008 2006 2008 2010 2 1.33 0.1
SalePrice 180921 34900 163000 755000 84025 79443 1.88

To illustrate the descriptive statistics for the numeric variables, box plots have been created below. These plots were used because they effectively show the spread and skewness of the data, the median, and any outliers.

train_int_melted <- melt(train_int)
## No id variables; using all as measure variables
ggplot(train_int_melted, aes(variable, value)) + geom_boxplot(aes(fill = variable), alpha = 0.75, show.legend = FALSE) + facet_wrap(~variable, scale="free") + scale_y_continuous('') + scale_x_discrete('', breaks = NULL) + ggtitle("Distribution of Predictor and Target Variables\n")
## Warning: Removed 348 rows containing non-finite values (stat_boxplot).

Scatterplot

Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.

pairs(~LotArea+GrLivArea+SalePrice, data=train, main="2 Independent 1 dependent variable")

Looking at the above scatterplot matrix, there is a weak linear relationship between the lot size and the sale price of the house. Most of the sale prices are concentrated between 100k and 300k, and most lot sizes fall within a relatively narrow range, with a few extreme outliers. The largest lots do not necessarily belong to the most expensive properties, which is why we do not see a stronger correlation; in fact, a handful of very large lots noticeably skew the relationship between LotArea and SalePrice. The second relationship, with the above-grade living area (GrLivArea), shows a clearly stronger linear pattern, which is easy to infer both from the plot and from common sense.

Hypothesis and 80% confidence interval

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

Testing for:

H0: ρ = 0 (“the population correlation coefficient is 0; there is no association”)

H1: ρ ≠ 0 (“the population correlation coefficient is not 0; a nonzero correlation could exist”)

print('values of XZ --r,p')
## [1] "values of XZ --r,p"
zct <- corr.test(X,Z, alpha=0.2, minlength = 5)
zct$r
## [1] 0.2631162
zct$p
## [1] 1.520481e-24
print('values of XY --r,p')
## [1] "values of XY --r,p"
yct <- corr.test(X,Y, alpha=0.2, minlength = 5)
yct$r
## [1] 0.2638434
yct$p
## [1] 1.123139e-24
FWER <- 1-(1-0.2)^2
FWER
## [1] 0.36
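As a cross-check on the corr.test results above, the base cor.test function reports the requested 80% confidence interval for each pairwise correlation directly; a minimal sketch:

# 80% confidence intervals for the pairwise correlations themselves
cor.test(X, Z, conf.level = 0.80)$conf.int
cor.test(X, Y, conf.level = 0.80)$conf.int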

The table below shows the number of cases, mean, and standard deviation for the X, Y, and Z variables:

VAR   n      \(\bar{x}\)   \(s\)
X     1460   10516.83      9981.265
Y     1460   180921.2      79442.5
Z     1460   1515          525.5

We can derive the point estimate (the difference in means) by subtracting the mean of Y from the mean of X, and similarly for the pairwise X and Z.

The difference in means XY (point estimate) = -170404.4 The difference in means XZ (point estimate) = 9001.83

To get the standard error (\(SE\)) of the point estimate, we take the sum of each sample's variance divided by its sample size, and then take the square root of that value:
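In symbols, this is the standard two-sample standard error:

\(SE = \sqrt{ \frac{s_X^2}{n_X} + \frac{s_Y^2}{n_Y} }\)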

# Standard Error of Point Estimate
XYSE <- sqrt((var(X) / length(X)) + (var(Y) / length(Y)))
XZSE <- sqrt((var(X) / length(X)) + (var(Z) / length(Z)))

XYSE = 2095.451

XZSE = 261.58

The T-score is the point estimate divided by the standard error:

TXY = -170404.4 / 2095.451 = -81.321

TXZ = 9001.83 / 261.58 = 34.41

The confidence level for the difference in means can be calculated by taking the point estimate \(\pm\) the t-value associated with the degrees of freedom \(\times\) the standard error. Here we will use R’s t.test function to calculate the 80% confidence level that the difference in means is not zero:

t.test(X, Y, conf.level = 0.8)
## 
##  Welch Two Sample t-test
## 
## data:  X and Y
## t = -81.321, df = 1505.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 80 percent confidence interval:
##  -173091.0 -167717.8
## sample estimates:
## mean of x mean of y 
##  10516.83 180921.20
t.test(X, Z, conf.level = 0.8)
## 
##  Welch Two Sample t-test
## 
## data:  X and Z
## t = 34.411, df = 1467.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 80 percent confidence interval:
##  8665.981 9336.748
## sample estimates:
## mean of x mean of y 
## 10516.828  1515.464

From the output, we are 80% confident that the true difference in means lies between -173091.0 and -167717.8. The interval is negative because the mean of Y is much larger than the mean of X.

For the XZ pair, we are 80% confident that the true difference in means lies between 8665.981 and 9336.748.

Both pairwise correlations are positive and the p-values are extremely small, so we reject the null hypothesis of zero correlation in each case. Regarding familywise error: with two tests at \(\alpha = 0.2\), the familywise error rate is \(1-(1-0.2)^2 = 0.36\), which is fairly high. However, because both p-values are so small, the conclusions would not change after any reasonable correction, so familywise error is not a practical concern for the comparisons made here.

Linear Algebra and Correlation.

Invert your correlation matrix.

This is known as the precision matrix and contains variance inflation factors on the diagonal.

XY <- data.frame(X, Y)
cor_matrix <- cor(XY)
cor_matrix
##           X         Y
## X 1.0000000 0.2638434
## Y 0.2638434 1.0000000
prec_matrix <- solve(cor_matrix)
prec_matrix
##            X          Y
## X  1.0748219 -0.2835846
## Y -0.2835846  1.0748219

This inverted correlation matrix has variance inflation factors on the diagonal, which are a function of how closely one variable is a linear function of the other variable.
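As a quick check on that interpretation, for a \(2 \times 2\) correlation matrix the diagonal of the inverse equals \(\frac{1}{1-r^2}\); a minimal sketch using the objects above:

# Diagonal of the precision matrix equals 1 / (1 - r^2) in the 2x2 case
r <- cor_matrix["X", "Y"]
1 / (1 - r^2)        # roughly 1.0748, matching the diagonal entries
diag(prec_matrix)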

Matrices

Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.

cor_matrix %*% prec_matrix
##   X Y
## X 1 0
## Y 0 1
prec_matrix %*% cor_matrix
##   X Y
## X 1 0
## Y 0 1

Since the precision matrix is the inverse of the correlation matrix, multiplying the two together will result in the identity matrix, which we see in the resulting \(2 \times 2\) matrix with 1s on the diagonal.

In the above process, we switched the order of multiplying the correlation and the precision matrix, both resulting in an identity matrix. This is proving that the correlation matrix is invertible, since

\(A^{-1}A = I\) and \(AA^{-1} = I\)

Conduct LU Decomposition on the Matrix

LuA <- lu.decomposition(cor_matrix)
L <- LuA$L
U <- LuA$U
L
##           [,1] [,2]
## [1,] 1.0000000    0
## [2,] 0.2638434    1
U
##      [,1]      [,2]
## [1,]    1 0.2638434
## [2,]    0 0.9303867

Multiplying the decomposed factors L and U reproduces the original correlation matrix, as a proof:

#Proof 
proof <- LuA$L%*%LuA$U
proof
##           [,1]      [,2]
## [1,] 1.0000000 0.2638434
## [2,] 0.2638434 1.0000000
cor_matrix
##           X         Y
## X 1.0000000 0.2638434
## Y 0.2638434 1.0000000

Calculus-Based Probability & Statistics.

For your variable that is skewed to the right, shift it so that the minimum value is above zero.

X (LotArea) is skewed to the right, but its minimum value is already above zero, as no lot sizes of zero or below exist in the data set. In fact, none of the numerical variables have any values below 0, as indicated in the table in the descriptive statistics section, so no shift is needed.

min(X)
## [1] 1300

Load the MASS package and run fitdistr to fit an exponential probability density function.
(See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html).

expdf <- fitdistr(X, densfun="exponential")

Find the optimal value of \(\lambda\) for this distribution.

In many examples, the exponential distribution is used to model the probability of events occurring over a period of time (time elapsed between customers, weather events, phone calls, etc.). The probability of the event occurring is proportional to the length of the interval. In other words, if an event happens on average six times in an hour, \(\lambda = 6\), and the expected value of the distribution is the average wait time, \(\frac{1}{\lambda}\), or 10 minutes.
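To make that waiting-time interpretation concrete, here is a small simulation sketch (illustrative only, separate from the lot-area analysis):

# Events at rate 6 per hour imply a mean waiting time of 1/6 hour = 10 minutes
waits <- rexp(100000, rate = 6)   # simulated waiting times, in hours
mean(waits) * 60                  # close to 10 (minutes)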

For this distribution, a skewed variable of lot sizes in properties for sale was used. Instead of moments in time, \(\lambda\) is representing the increase in size of the property’s lot area.

# get value of lambda from exponential distribution
lambda <- expdf$estimate

# expected value of lambda
rate <- 1 / lambda
rate
##     rate 
## 10516.83

We can see that this is the same as the mean given in the descriptive statistics for the X variable (LotArea). In other words, the fitted rate \(\lambda\) is equal to one divided by this mean (\(\frac{1}{10516.83}\)).

Then, take 1000 samples from this exponential distribution using this value. (e.g., rexp(1000, some_val))

# 1000 samples from expon. dist. using lambda
expdf_samp <- rexp(1000, lambda)

Plot a histogram and compare it with a histogram of your original variable.

A histogram of the exponential fit of the original X variable (LotArea) is below:

hist(expdf_samp, col="royalblue", main="Histogram of Exponential Fit of X")

Here we see that the histogram follows the pattern of an exponential pdf: the first bin has the highest count, and the counts get successively smaller with each bin. The distribution is right-skewed, but decays smoothly.

Below is a histogram of the original X variable, without the exponential fit:

hist(X, col="royalblue", main="Histogram of X Variable", breaks=20)

The histogram of the original X variable is also right-skewed, but much more so than the exponential fit of X. Looking at the summary statistics illustrates the difference in the spread of the data.

Summary Stats of Exponential Fit of X:

summary(expdf_samp)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     7.29  3288.17  7276.83 10230.06 13992.43 76516.83

Summary Stats of X:

summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7554    9478   10517   11602  215245

The values of the observations of X are much more spread out, with the difference between the minimum and maximum value being 213945, compared with roughly 76510 in the exponential fit of X.
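A more direct visual comparison is to overlay the fitted exponential density on a density-scaled histogram of the original X; a sketch using the lambda estimated above:

# Overlay the fitted exponential density on the observed LotArea distribution
hist(X, breaks = 40, freq = FALSE, col = "lightgray",
     main = "LotArea with Fitted Exponential Density", xlab = "LotArea")
curve(dexp(x, rate = lambda), from = 0, to = max(X),
      add = TRUE, col = "royalblue", lwd = 2)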

CDF

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).

# 5 and 95 percentile of exponential pdf
qexp(c(.05, .95), rate = lambda)
## [1]   539.4428 31505.6013

Generate a 95% confidence interval from the empirical data, assuming normality

Assuming normality (a normal distribution) for the originally observed data for X, R's qnorm function can be used to generate the 95% interval. Unlike the problem above, we are not looking for the 5th and 95th percentiles, but for a two-tailed 95% interval (0.025 in each tail).

qnorm(c(.025, .975), mean=mean(X), sd=sd(X))
## [1] -9046.092 30079.748

If the data for our X variable were normally distributed, roughly 95% of the individual observations would fall between -9046.1 and 30079.7. (The negative lower bound is itself a sign that the normality assumption fits LotArea poorly.)
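If instead the goal is a 95% confidence interval for the mean of X, the standard error \(sd(X)/\sqrt{n}\) would replace the standard deviation; a minimal sketch:

# 95% CI for the mean of X, using the standard error rather than the sd
qnorm(c(.025, .975), mean = mean(X), sd = sd(X) / sqrt(length(X)))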

Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

As in the first (probability) section, where the median of X and the 1st quartile of Y were calculated, the quantile function in R can be used to provide the 5th and 95th percentiles of the observed values of X.

# 5th and 95th percentiles of empirical data X var
quantile(X, c(.05, .95))
##       5%      95% 
##  3311.70 17401.15

About 5% of the values are at or below 3311.70, and about 5% are at or above 17401.15. 90% of the observed values fall between these two percentiles, so this is essentially an empirical 90% interval.

Modeling

In the descriptive statistics section, the number of categorical variables was found to be over 40. To keep things somewhat simple, the subset of the train dataset's numerical variables (minus the Id variable) is used for creating a predictive model.

First, a linear model using all of the numerical predictors is used:

model0 <- glm(SalePrice ~ ., data=train_int)
full_mod_summ <- summary(model0)

From the summary output, there are a number of predictor variables that have been flagged as highly significant (***). Let’s build a model using these highly-significant predictors:

# AIC 34834
model1 <- glm(SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond + YearBuilt + X1stFlrSF + X2ndFlrSF + BedroomAbvGr + GarageCars, data=train_int)

summary(model1)
## 
## Call:
## glm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual + 
##     OverallCond + YearBuilt + X1stFlrSF + X2ndFlrSF + BedroomAbvGr + 
##     GarageCars, data = train_int)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -466869   -18409    -2319    15022   274328  
## 
## Coefficients:
##                  Estimate   Std. Error t value         Pr(>|t|)    
## (Intercept)  -993242.4309   88871.5603 -11.176          < 2e-16 ***
## MSSubClass      -171.4795      24.9561  -6.871 0.00000000000942 ***
## LotArea            0.5727       0.1025   5.585 0.00000002792578 ***
## OverallQual    19990.2071    1129.8259  17.693          < 2e-16 ***
## OverallCond     5975.4051     953.9547   6.264 0.00000000049402 ***
## YearBuilt        463.8347      45.4750  10.200          < 2e-16 ***
## X1stFlrSF         80.7214       3.6522  22.102          < 2e-16 ***
## X2ndFlrSF         63.0789       3.3690  18.723          < 2e-16 ***
## BedroomAbvGr   -9771.5864    1487.0266  -6.571 0.00000000006940 ***
## GarageCars     11649.8427    1760.5266   6.617 0.00000000005135 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1336024492)
## 
##     Null deviance: 9207911334610  on 1459  degrees of freedom
## Residual deviance: 1937235513612  on 1450  degrees of freedom
## AIC: 34834
## 
## Number of Fisher Scoring iterations: 2

Another model is built using a log transformation on the LotArea variable, and replacing the X1stFlrSF, X2ndFlrSF, and BedroomAbvGr with GrLivArea and Fireplaces:

# AIC 34891
model2 <- glm(SalePrice ~ MSSubClass + log(LotArea) + OverallQual + OverallCond + YearBuilt + GrLivArea + GarageCars + Fireplaces, data=train_int)

summary(model2)
## 
## Call:
## glm(formula = SalePrice ~ MSSubClass + log(LotArea) + OverallQual + 
##     OverallCond + YearBuilt + GrLivArea + GarageCars + Fireplaces, 
##     data = train_int)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -380822   -19759    -1797    14676   295224  
## 
## Coefficients:
##                  Estimate   Std. Error t value         Pr(>|t|)    
## (Intercept)  -1246905.797    95127.686 -13.108          < 2e-16 ***
## MSSubClass       -126.766       27.534  -4.604 0.00000450588954 ***
## log(LotArea)    15648.989     2509.673   6.235 0.00000000058944 ***
## OverallQual     21537.900     1146.236  18.790          < 2e-16 ***
## OverallCond      5541.537      968.840   5.720 0.00000001293403 ***
## YearBuilt         520.955       46.726  11.149          < 2e-16 ***
## GrLivArea          50.321        2.748  18.310          < 2e-16 ***
## GarageCars      12490.180     1797.504   6.949 0.00000000000555 ***
## Fireplaces       8041.879     1768.696   4.547 0.00000590023871 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1389925390)
## 
##     Null deviance: 9207911334610  on 1459  degrees of freedom
## Residual deviance: 2016781741307  on 1451  degrees of freedom
## AIC: 34891
## 
## Number of Fisher Scoring iterations: 2

Lastly, a model using only a few predictors is built:

# AIC -982.43 (response is log-transformed, so not directly comparable with the AICs above)
model3 <- glm(log(SalePrice) ~ log(LotArea) + OverallQual + YearBuilt + GrLivArea, data=train_int)
summary(model3)
## 
## Call:
## glm(formula = log(SalePrice) ~ log(LotArea) + OverallQual + YearBuilt + 
##     GrLivArea, data = train_int)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.92463  -0.07710   0.01250   0.09524   0.54073  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.64993531 0.36583013   7.244 7.06e-13 ***
## log(LotArea) 0.14942513 0.00952403  15.689  < 2e-16 ***
## OverallQual  0.13038058 0.00496359  26.267  < 2e-16 ***
## YearBuilt    0.00348343 0.00018658  18.670  < 2e-16 ***
## GrLivArea    0.00023143 0.00001167  19.829  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.029731)
## 
##     Null deviance: 232.801  on 1459  degrees of freedom
## Residual deviance:  43.259  on 1455  degrees of freedom
## AIC: -982.43
## 
## Number of Fisher Scoring iterations: 2

Looking at the coefficients for the predictor variables, we can see that they all have positive estimates, though the values are small because the response is on the log scale. This makes sense, since we can generally assume that the lot size, overall quality, year built, and square footage of the home have a positive relationship to the price (i.e., large, well-built, new houses on large lots cost more).

The biggest difference in this model is the log transformation of the response variable, which is also skewed. Because of its simplicity, we'll use this model for predicting the Sale Price of the properties in the Kaggle dataset. Using R's predict function, we'll feed it the third, reduced model and the test dataset, and store the results in an object.

predict_price <- predict(model3, test, type="response")
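As a rough internal check on the chosen model (not the Kaggle leaderboard metric, which is computed on held-out data), the in-sample root mean squared error on the log(SalePrice) scale can also be computed from the training data; a sketch:

# In-sample RMSE on the log(SalePrice) scale for model3
train_log_pred <- predict(model3, train_int, type = "response")
sqrt(mean((log(train_int$SalePrice) - train_log_pred)^2))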

Taking the resulting predict_price vector, we'll bind it to the Id column of the test data, then create a .csv from the data.

results <- data.frame(test$Id, exp(predict_price))  # use exp to transform the log of the response variable back
colnames(results) <- c("Id", "SalePrice")
head(results)
##     Id SalePrice
## 1 1461  125357.1
## 2 1462  161087.0
## 3 1463  172810.9
## 4 1464  187072.4
## 5 1465  198994.6
## 6 1466  186086.3
# Write to .csv for submission to Kaggle
#write.csv(results, file = "predicted_prices_sergio.csv", row.names = FALSE) # not for rendering R Markdown

Kaggle Submission

My Kaggle user name is Sortega78, and the resulting score on Kaggle.com from this model is 0.16805.

knitr::include_graphics("C:\\Users\\sergioor\\Desktop\\Old PC\\CUNY\\DATA605\\DATA605_Final-master\\kagglesergio.png")