#libraries for the final
library(matlib)
library(igraph)
library(expm)
library(ggplot2)
library(dplyr)
library(nnet)
library(stats)
library(matrixcalc)
library(ggpubr)

Part 1
(Hi Larry! I might be very tired, but I’m going to assume you mean the PageRank matrix given in the final.)
#Let's form matrix A
A<-matrix(c(0,.5,.5,0,0,0,0,0,0,0,0,0,1/3,1/3,0,0,1/3,0,0,0,0,0,.5,.5,0,0,0,.5,0,.5,0,0,0,1,0,0),nrow = 6,byrow = TRUE)
#We noticed, before applying our decay formula, that the Markov matrix is not stochastic: the second page is disconnected from the rest of the web graph
#here's where I'm not sure which is correct: removing the second row, or assuming that page two links to every page with the same probability
#we will go with the uniform probability given in the reading
Alt_A<-matrix(c(0,.5,.5,0,0,0,1/6,1/6,1/6,1/6,1/6,1/6,1/3,1/3,0,0,1/3,0,0,0,0,0,.5,.5,0,0,0,.5,0,.5,0,0,0,1,0,0),nrow = 6,byrow = TRUE)
# To form Matrix B, we will use the formula for decay given in the final notes
B<-0.85*Alt_A+(0.15/nrow(Alt_A))
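# Quick check (a sketch): with the decay term added, every row of B should sum to 1
rowSums(B)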
#form a uniform rank vector r and perform power iterations until it converges: r = B^n x r
r<-c(1/6,1/6,1/6,1/6,1/6,1/6)
#for 30 iterations, take powers of matrix B, transpose, and multiply with r until it converges
#the iteration converges close to the vector given in the final
for (i in 1:30) {
print(t(B%^%i)%*%r)
}
## [,1]
## [1,] 0.09583333
## [2,] 0.16666667
## [3,] 0.11944444
## [4,] 0.26111111
## [5,] 0.16666667
## [6,] 0.19027778
## [,1]
## [1,] 0.08245370
## [2,] 0.12318287
## [3,] 0.08934028
## [4,] 0.28118056
## [5,] 0.19342593
## [6,] 0.23041667
## [,1]
## [1,] 0.06776399
## [2,] 0.10280681
## [3,] 0.07749373
## [4,] 0.32051109
## [5,] 0.18726572
## [6,] 0.24415866
## [,1]
## [1,] 0.06152086
## [2,] 0.09032055
## [3,] 0.06836399
## [4,] 0.32668709
## [5,] 0.19773807
## [6,] 0.25536944
## [,1]
## [1,] 0.05716521
## [2,] 0.08331157
## [3,] 0.06394177
## [4,] 0.33889812
## [5,] 0.19600722
## [6,] 0.26067610
## [,1]
## [1,] 0.05491931
## [2,] 0.07921452
## [3,] 0.06109769
## [4,] 0.34168023
## [5,] 0.19895101
## [6,] 0.26413724
## [,1]
## [1,] 0.05353307
## [2,] 0.07687377
## [3,] 0.05956276
## [4,] 0.34529289
## [5,] 0.19874717
## [6,] 0.26599033
## [,1]
## [1,] 0.05276657
## [2,] 0.07551812
## [3,] 0.05864201
## [4,] 0.34644978
## [5,] 0.19951605
## [6,] 0.26710748
## [,1]
## [1,] 0.05231364
## [2,] 0.07473943
## [3,] 0.05812419
## [4,] 0.34753408
## [5,] 0.19955479
## [6,] 0.26773388
## [,1]
## [1,] 0.05205661
## [2,] 0.07428990
## [3,] 0.05782138
## [4,] 0.34797267
## [5,] 0.19975859
## [6,] 0.26810085
## [,1]
## [1,] 0.05190713
## [2,] 0.07403118
## [3,] 0.05764846
## [4,] 0.34830753
## [5,] 0.19979551
## [6,] 0.26831019
## [,1]
## [1,] 0.05182148
## [2,] 0.07388201
## [3,] 0.05754828
## [4,] 0.34846450
## [5,] 0.19985218
## [6,] 0.26843154
## [,1]
## [1,] 0.05177196
## [2,] 0.07379609
## [3,] 0.05749075
## [4,] 0.34857061
## [5,] 0.19986938
## [6,] 0.26850121
## [,1]
## [1,] 0.05174349
## [2,] 0.07374658
## [3,] 0.05745753
## [4,] 0.34862496
## [5,] 0.19988600
## [6,] 0.26854144
## [,1]
## [1,] 0.05172707
## [2,] 0.07371805
## [3,] 0.05743842
## [4,] 0.34865921
## [5,] 0.19989267
## [6,] 0.26856459
## [,1]
## [1,] 0.05171761
## [2,] 0.07370161
## [3,] 0.05742739
## [4,] 0.34867768
## [5,] 0.19989777
## [6,] 0.26857794
## [,1]
## [1,] 0.05171216
## [2,] 0.07369214
## [3,] 0.05742105
## [4,] 0.34868886
## [5,] 0.19990017
## [6,] 0.26858563
## [,1]
## [1,] 0.05170902
## [2,] 0.07368668
## [3,] 0.05741739
## [4,] 0.34869507
## [5,] 0.19990178
## [6,] 0.26859006
## [,1]
## [1,] 0.05170721
## [2,] 0.07368354
## [3,] 0.05741528
## [4,] 0.34869875
## [5,] 0.19990261
## [6,] 0.26859261
## [,1]
## [1,] 0.05170616
## [2,] 0.07368173
## [3,] 0.05741406
## [4,] 0.34870083
## [5,] 0.19990313
## [6,] 0.26859408
## [,1]
## [1,] 0.05170556
## [2,] 0.07368068
## [3,] 0.05741336
## [4,] 0.34870205
## [5,] 0.19990342
## [6,] 0.26859493
## [,1]
## [1,] 0.05170522
## [2,] 0.07368008
## [3,] 0.05741296
## [4,] 0.34870274
## [5,] 0.19990359
## [6,] 0.26859542
## [,1]
## [1,] 0.05170502
## [2,] 0.07367973
## [3,] 0.05741273
## [4,] 0.34870314
## [5,] 0.19990368
## [6,] 0.26859570
## [,1]
## [1,] 0.05170490
## [2,] 0.07367953
## [3,] 0.05741259
## [4,] 0.34870337
## [5,] 0.19990374
## [6,] 0.26859586
## [,1]
## [1,] 0.05170484
## [2,] 0.07367942
## [3,] 0.05741252
## [4,] 0.34870350
## [5,] 0.19990377
## [6,] 0.26859595
## [,1]
## [1,] 0.05170480
## [2,] 0.07367935
## [3,] 0.05741247
## [4,] 0.34870358
## [5,] 0.19990379
## [6,] 0.26859601
## [,1]
## [1,] 0.05170478
## [2,] 0.07367931
## [3,] 0.05741245
## [4,] 0.34870363
## [5,] 0.19990380
## [6,] 0.26859604
## [,1]
## [1,] 0.05170476
## [2,] 0.07367929
## [3,] 0.05741243
## [4,] 0.34870365
## [5,] 0.19990380
## [6,] 0.26859606
## [,1]
## [1,] 0.05170476
## [2,] 0.07367928
## [3,] 0.05741242
## [4,] 0.34870367
## [5,] 0.19990381
## [6,] 0.26859607
## [,1]
## [1,] 0.05170475
## [2,] 0.07367927
## [3,] 0.05741242
## [4,] 0.34870367
## [5,] 0.19990381
## [6,] 0.26859607
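# A variant of the power iteration (a sketch, assuming B and r as defined above): update r
# repeatedly and stop once it stops changing, instead of fixing the number of iterations
r_new <- r
repeat {
  r_old <- r_new
  r_new <- t(B) %*% r_old          # same update as t(B %^% i) %*% r, one step at a time
  if (max(abs(r_new - r_old)) < 1e-10) break
}
print(r_new)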
#Finding the eigendecomposition of B: B = Q D Q^(-1), where D is the diagonal matrix of eigenvalues
#Using eigen() to find the eigenvalues (for the diagonal matrix) and the eigenvectors
data<-eigen(B)
evalues<-data$values
dia<-diag(evalues)
Q<-data$vectors
#coming back to my previous comment, we do need that second row so the decomposition stays square (6x6). Use matrix multiplication, i.e. %*%, and the inverse of Q (not its transpose, since B is not symmetric)
decom<-Q%*%dia%*%solve(Q)
#let's peek at our eigenvalues to verify the largest eigenvalue is 1
print(evalues)
## [1] 1.00000000+0i 0.57619235+0i -0.42500000+0i -0.42500000-0i -0.34991524+0i
## [6] -0.08461044+0i
#let's check that the vector found through our power iteration sums to one
test<-sum(t(B%^%100)%*%r)
sprintf("Looking at our power iteration the vector sum is %1.2f",test)
## [1] "Looking at our power iteration the vector sum is 1.00"
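# Cross-check (a sketch): the PageRank vector is the leading eigenvector of t(B), since the
# iteration above multiplies by t(B); rescaled to sum to 1 it should match the power-iteration result
left_vec <- Re(eigen(t(B))$vectors[, 1])
print(left_vec / sum(left_vec))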
#graphing the page link structure with and without decay
#The 'graph' package does not exist anymore :(, will use igraph
plot(graph.adjacency(A,weighted = TRUE,mode = "directed"))
plot(graph.adjacency(B,weighted = TRUE,mode = "directed"))

Part 2
(Similar in taste to the eigenimages -> the toughest thing I learned. I will need some additional resources walking through this set! The RStudio I am using does not want to render the images, so please email me if they do not appear in this document!)
(For the resources used to aid me in this problem, I took inspiration from the RDocumentation page for the MNIST function to understand the structure of the pixels. For an understanding of PCA, I used the article on PCA in the [discussion](https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html) and used the formula for the cumulative proportion from there: Learning PCA predictions.)
#Loading in our data sets
train_data<-read.csv("train.csv")
test_data<-read.csv("test.csv")
#plot the first 10 images of the set
# Reading the Kaggle description of our data set, each row consists of 785 columns: a label plus 784 pixels
#to create one image, the data set states each image's dimensions are 28x28 pixels
#so we take each row's pixel values and reshape them into a matrix of those dimensions
#in order to print the image, we need to convert the matrix entries to numeric with apply()
#To make sure the image is viewable, image() is given the 28x28 dimensions and a gray-scale palette
for (i in 1:10){
t<-matrix(train_data[i,-1],nrow=28,ncol=28)
t1<- apply(t, 2, as.numeric)
image(1:28, 1:28, t1, col=gray((0:255)/255))
}
#Go ahead and divide all pixels by 255 to produce values between 0 and 1.
#we will divide each image (i.e. t1) by 255 and not the original matrix, as this is the normalization of the pixels: each row (image) is divided by 255. In order to do this, we have to transform our DF into a vector for the normalization
normP<-as.vector(unlist(train_data[,2:785]/255))
data_scale2<-array(normP,dim=c(nrow(train_data),28,28))
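# Quick check (a sketch): the normalized pixel values should now lie between 0 and 1
range(data_scale2)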
#When I tried this with the original data set, there were still only ten labels, just with rescaled values
data_scale<-train_data/255
#the frequency distribution of the numbers in the dataset
#Looking at the chart below, it appears there is a strong pixel density around the .003->.02 range in the pixels
#This could be because the handwritten numbers are mostly drawn around this area
freq<-data_scale%>%group_by(label)%>%summarise(count=n())%>% arrange(desc(label))
print(freq)
## # A tibble: 10 x 2
## label count
## <dbl> <int>
## 1 0.0353 4188
## 2 0.0314 4063
## 3 0.0275 4401
## 4 0.0235 4137
## 5 0.0196 3795
## 6 0.0157 4072
## 7 0.0118 4351
## 8 0.00784 4177
## 9 0.00392 4684
## 10 0 4132
#With our normalized data, the pixel frequency is seen below
## The top pixels are closest to 1
freq2<-table(normP)
tail(freq2)
## normP
## 0.980392156862745 0.984313725490196 0.988235294117647 0.992156862745098
## 40161 78319 465950 1066447
## 0.996078431372549 1
## 560067 223310
#For each number, provide the mean pixel intensity
#for each of the first ten images, we take its vector of pixel values and find the mean
#Looking at the ten images, it appears characters that require more strokes have a higher pixel density
for (j in 1:10){
t<-matrix(train_data[j,],nrow=28,ncol=28)
t1<- apply(t, 2, as.numeric)
m<- mean(unlist(t1))
print(m)
}
## [1] 21.23724
## [1] 56.89923
## [1] 17.125
## [1] 19.16964
## [1] 65.16964
## [1] 29.41454
## [1] 21.88648
## [1] 30.98342
## [1] 35.61862
## [1] 40.27168
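# An alternative reading of the question (a sketch): the mean pixel intensity for each digit label
# 0-9, averaging the per-image means within each label rather than over the first ten rows
tapply(rowMeans(train_data[, -1]), train_data$label, mean)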
#Reduce the data by using principal components that account for 95% of the variance
#using prcomp to find the principal components of the original matrix
# My understanding of PCA -> we are finding the covariance structure of the matrix and choosing the most efficient dimensions (directions) that align with our data
#To find the number of components that will capture 95% of the variability, we will look at the cumulative proportion
#If there were far fewer pixels, summary() of the PCA would show the cumulative proportion directly, but we will calculate it below
train_PCA<-prcomp(train_data)
#summary(train_PCA) Matrix is too big to print out
#To calculate cumulative proportion, we need the std of the matrix, its eigen vectors
pca_sd<-train_PCA$sdev
pca_cumprop<-cumsum(pca_sd^2)/sum(pca_sd^2)
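# A sketch to pull out the exact component count that first crosses 95% cumulative variance
which(pca_cumprop >= 0.95)[1]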
# It appears that the efficient number of components is around 150
qplot(1:785,pca_cumprop)+geom_line()+ geom_hline(yintercept=.95, linetype="dashed", color = "pink")+xlim(0,200)+theme_classic()+labs(title="Cumulative Proportion of the train dataset", x="# of Components", y="Cumulative explained variance")
#Plot the first 10 images generated by PCA. They will appear to be noise
#They appear as noise as there are not a lot of vectors with accounted detail; it's just colors (I definitely did something wrong, but I will push forward)
#we would have to reduce the dimensionality and reconstruct from several components to get a clearer picture (see the reconstruction sketch after the component plots below)
pca_img<-train_PCA$rotation
for (k in 1:10){
image(1:28,1:28,array(pca_img[,k],dim=c(28,28)),col=gray((0:255)/255))
}
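# A rough reconstruction sketch (assuming PCA is re-run on the pixel columns only, so each
# component has 784 entries): rebuild the first image from its top 150 components
pix_PCA <- prcomp(train_data[, 2:785])    # can take a while on the full training set
k <- 150
recon <- pix_PCA$x[1, 1:k] %*% t(pix_PCA$rotation[, 1:k]) + pix_PCA$center
image(1:28, 1:28, matrix(as.numeric(recon), 28, 28), col = gray((0:255)/255))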
#select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance
#we will follow the same steps as above
#I noticed the characters are more defined than in the previous PCA run; it could be that there is less noise for the character 8 compared to the mix of all labels
e_only<-train_data%>%filter(label==8)
ePCA<-prcomp(e_only)
e_sd<-ePCA$sdev
pca_cumprop<-cumsum(e_sd^2)/sum(e_sd^2)
pca_img<-ePCA$rotation
for (l in 1:10){
image(1:28,1:28,array(pca_img[,l],dim=c(28,28)))
}
#Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix)
#first, we break out the x and y of our matrix: the labels go in the y vector and the pixel values in x
#Then, we can build our multinomial regression. I learned from the first attempt that the weight limit (MaxNWts) needs to be increased to accommodate the model
x_tr<-train_data[1:14000,2:785]
y_tr<-train_data[1:14000,1]
multi_model<-multinom(y_tr~.,x_tr,MaxNWts=10000)
## # weights: 7860 (7065 variable)
## initial value 32236.191302
## iter 10 value 7241.767111
## iter 20 value 5183.685696
## iter 30 value 4666.784980
## iter 40 value 4458.317898
## iter 50 value 4348.716039
## iter 60 value 4290.724407
## iter 70 value 4259.056822
## iter 80 value 4232.320111
## iter 90 value 4201.865931
## iter 100 value 4175.612162
## final value 4175.612162
## stopped after 100 iterations
#Test the current model on our test set and see its accuracy via the confusion matrix
x_ts<-train_data[14001:28000,2:785]
y_ts<-train_data[14001:28000,1]
predict_multi<-predict(multi_model,newdata = x_ts)
# Flag | The caret package is not available for the version of R I am using
#I will include a screenshot of this section from DataCamp | The code used is commented out below :(
# confusionMatrix(predict_multi, as.factor(y_ts))
#It appears the predictions have 87% accuracy
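# Since caret is unavailable, a base-R sketch of the same confusion matrix and accuracy
# (assumes predict_multi and y_ts as defined above)
conf_mat <- table(observed = y_ts, predicted = predict_multi)
print(conf_mat)
sprintf("Classification accuracy: %.1f%%", 100 * sum(diag(conf_mat)) / sum(conf_mat))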
I have also included a link to check the statistics if needed here.

Part 3
#loading in our housing data
house_train<-read.csv("train_house.csv")
house_test<-read.csv("test_house.csv")
#Provide univariate descriptive statistics and appropriate plots for the training data set
# we can use summary() for a descriptive analysis of each (quantitative/qualitative) variable in the data
summary(house_train)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
#Provide a scatter plot matrix for at least two of the independent variables and the dependent variable
ggplot(house_train,aes(x=OverallQual,y=MasVnrArea))+geom_point()+theme_classic()
## Warning: Removed 8 rows containing missing values (geom_point).
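# A scatter plot matrix that also includes the dependent variable (a sketch; OverallQual and
# GrLivArea are just an illustrative pick of independent variables)
pairs(house_train[, c("OverallQual", "GrLivArea", "SalePrice")])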
# Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval.
#To test the null hypothesis that there is no correlation, we can use cor.test() with a confidence level of 80% for each pairwise combination of variables
mCols<-house_train%>%select(YearRemodAdd,LotArea,MoSold)
corrM<-cor(mCols)
corrM
## YearRemodAdd LotArea MoSold
## YearRemodAdd 1.00000000 0.013788427 0.021490002
## LotArea 0.01378843 1.000000000 0.001204988
## MoSold 0.02149000 0.001204988 1.000000000
#None of our pairwise tests reject the null hypothesis, as their p-values are > 0.05
#we don't have to worry about a family-wise error here since we test each pair individually rather than as a group
#Individually, we can see whether there is a correlation between each pair of variables, i.e. which combination would reject
#as seen below, no pair rejects, which eliminates the chance of a false positive. If one of them did reject, then we would need to worry
cor.test(house_train$YearRemodAdd,house_train$LotArea,conf.level=.80)
##
## Pearson's product-moment correlation
##
## data: house_train$YearRemodAdd and house_train$LotArea
## t = 0.52654, df = 1458, p-value = 0.5986
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.01978237 0.04732816
## sample estimates:
## cor
## 0.01378843
cor.test(house_train$YearRemodAdd,house_train$MoSold,conf.level=.80)
##
## Pearson's product-moment correlation
##
## data: house_train$YearRemodAdd and house_train$MoSold
## t = 0.82076, df = 1458, p-value = 0.4119
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.01208035 0.05501196
## sample estimates:
## cor
## 0.02149
cor.test(house_train$LotArea,house_train$MoSold,conf.level=.80)
##
## Pearson's product-moment correlation
##
## data: house_train$LotArea and house_train$MoSold
## t = 0.046011, df = 1458, p-value = 0.9633
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.03235796 0.03476522
## sample estimates:
## cor
## 0.001204988
# Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.)
#Use solve() to retrieve the inverse
precMatrix<-solve(corrM)
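# Quick look (a sketch): the diagonal of the precision matrix holds the variance inflation
# factors mentioned above; with correlations this weak they should all be very close to 1
diag(precMatrix)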
# Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
#Let's use the lu.decomposition() function to perform the decomposition rather than coding it by hand
round(corrM %*% precMatrix)
## YearRemodAdd LotArea MoSold
## YearRemodAdd 1 0 0
## LotArea 0 1 0
## MoSold 0 0 1
round(precMatrix%*% corrM)
## YearRemodAdd LotArea MoSold
## YearRemodAdd 1 0 0
## LotArea 0 1 0
## MoSold 0 0 1
lu.decomposition(corrM)
## $L
## [,1] [,2] [,3]
## [1,] 1.00000000 0.0000000000 0
## [2,] 0.01378843 1.0000000000 0
## [3,] 0.02149000 0.0009088477 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 0.01378843 0.0214900022
## [2,] 0 0.99980988 0.0009086749
## [3,] 0 0.00000000 0.9995373540
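# Sanity check (a sketch): multiplying the L and U factors back together should reproduce corrM
luM <- lu.decomposition(corrM)
max(abs(luM$L %*% luM$U - corrM))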
# Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the
# minimum value is absolutely above zero if necessary.Then load the MASS package and run fitdistr to fit
# an exponential probability density function.
#SalePrice was one of the variables that is right-skewed and has no NA values; its minimum ($34,900) is already above zero, so no shift is needed
ggplot(house_train,aes(x=SalePrice))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Unloading the matlib package, as we cannot use MASS while it is loaded (MASS is one of its dependencies)
detach("package:matlib", unload=TRUE)
library(MASS)
## Warning: package 'MASS' was built under R version 4.1.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
alp<-fitdistr(house_train$SalePrice,densfun="exponential")
# then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable.
sam_alp<-rexp(1000,alp$estimate)
g1<-qplot(sam_alp)+geom_histogram()+labs(title="Optimally fitted sample values")+theme_light()
g2<-ggplot(house_train,aes(x=SalePrice))+geom_histogram()+labs(title="Original values")+theme_light()
fig<-ggarrange(g1,g2,ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
fig
# Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function
#Let's use qexp() to find the values at the 5th and 95th percentiles of the fitted distribution
lp<-qexp(0.05,alp$estimate,lower.tail = TRUE)
hp<-qexp(0.95,alp$estimate,lower.tail = TRUE)
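# Cross-check (a sketch): the exponential quantile has the closed form -log(1 - p) / lambda,
# which should agree with the qexp() values above
lam <- unname(alp$estimate)
c(p05 = -log(1 - 0.05) / lam, p95 = -log(1 - 0.95) / lam)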
# Also generate a 95% confidence interval from the empirical data, assuming normality
#We can use t.test() to retrieve the confidence interval of SalePrice
#We are 95% confident that the mean house sale price in the dataset falls between $176,842.8 and $184,999.6 (either way I can't afford it)
con95<-t.test(house_train$SalePrice,conf.level = 0.95)
con95$conf.int
## [1] 176842.8 184999.6
## attr(,"conf.level")
## [1] 0.95
#provide the empirical 5th percentile and 95th percentile of the data.
#This means 5% of the houses sold for less than $88K, while houses above $326.1K sit in the top 5% of the market
quantile(house_train$SalePrice,c(0.05,0.95))
## 5% 95%
## 88000 326100
# Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis.
#let's work less and use the step() function to find the best-fitting set of variables that affect SalePrice
#First, let's gather all the possible variables that can affect the house's sale price; it can only include factor variables with at least two levels for lm() to process
#Then, the step function will drop terms that do not improve the AIC score. We do have to deal with the NAs in the training data
complete_lm<-lm(formula=SalePrice~ MSSubClass + MSZoning+LotArea+ LotConfig + LandSlope + Neighborhood + Condition1 + Condition2+BldgType+HouseStyle+OverallQual+ OverallCond + YearBuilt+YearRemodAdd+RoofStyle+ RoofMatl + Exterior1st + Exterior2nd + MasVnrType+ MasVnrArea+ExterQual +ExterCond+ Foundation+ BsmtQual + BsmtCond + BsmtExposure + BsmtFinType1 +BsmtFinSF1 +BsmtFinType2+ BsmtFinSF2+ BsmtUnfSF +TotalBsmtSF+ Heating+ HeatingQC +Electrical + X1stFlrSF + X2ndFlrSF + LowQualFinSF + GrLivArea + BsmtFullBath+FullBath+BedroomAbvGr+KitchenAbvGr +KitchenQual +TotRmsAbvGrd+Functional +Fireplaces + FireplaceQu +GarageType + GarageYrBlt+ GarageFinish + GarageCars + GarageArea +GarageQual +GarageCond+WoodDeckSF +OpenPorchSF+EnclosedPorch+X3SsnPorch+ScreenPorch+PoolArea+MiscVal+MoSold+YrSold+SaleType+SaleCondition,data=house_train)
#step(complete_lm,direction = "backward")
#There are NAs in the data, which stop the step function from doing its magic. However, grabbing the AIC table from the function's output, I sorted the largest-AIC terms in Excel and built them into their own lm() regression below
#Back to the NAs: I noticed that I cannot simply remove the rows containing NAs, since there are no fully complete rows, which caused problems with NA predictions (a possible imputation workaround is sketched below)
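# One possible workaround (a sketch on a copy of the data, so the model below is unchanged):
# impute the numeric predictors that contain NAs with their column medians
house_imp <- house_train
for (col in c("LotFrontage", "MasVnrArea", "GarageYrBlt")) {
  house_imp[[col]][is.na(house_imp[[col]])] <- median(house_imp[[col]], na.rm = TRUE)
}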
aic_lm<-lm(formula = SalePrice~RoofMatl+Condition2+Neighborhood+X2ndFlrSF+BsmtFinSF1+OverallCond+X1stFlrSF+LotConfig+KitchenQual+OverallQual+PoolArea+LotFrontage+LotArea+GarageArea+SaleCondition+BsmtUnfSF+BsmtFinSF2+EnclosedPorch+ExterQual+X3SsnPorch
, data = house_train)
summary(aic_lm)
##
## Call:
## lm(formula = SalePrice ~ RoofMatl + Condition2 + Neighborhood +
## X2ndFlrSF + BsmtFinSF1 + OverallCond + X1stFlrSF + LotConfig +
## KitchenQual + OverallQual + PoolArea + LotFrontage + LotArea +
## GarageArea + SaleCondition + BsmtUnfSF + BsmtFinSF2 + EnclosedPorch +
## ExterQual + X3SsnPorch, data = house_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -163393 -12165 740 12051 171072
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.166e+05 4.131e+04 -17.348 < 2e-16 ***
## RoofMatlCompShg 7.112e+05 3.303e+04 21.533 < 2e-16 ***
## RoofMatlMembran 7.334e+05 4.423e+04 16.582 < 2e-16 ***
## RoofMatlRoll 6.860e+05 4.249e+04 16.144 < 2e-16 ***
## RoofMatlTar&Grv 6.945e+05 3.437e+04 20.205 < 2e-16 ***
## RoofMatlWdShake 7.092e+05 3.848e+04 18.433 < 2e-16 ***
## RoofMatlWdShngl 7.882e+05 3.446e+04 22.873 < 2e-16 ***
## Condition2Feedr 1.344e+04 2.239e+04 0.600 0.548583
## Condition2Norm 2.028e+04 1.941e+04 1.045 0.296318
## Condition2PosA 1.620e+04 3.410e+04 0.475 0.634796
## Condition2PosN -2.126e+05 2.783e+04 -7.638 4.66e-14 ***
## Condition2RRNn 3.058e+04 2.737e+04 1.117 0.264097
## NeighborhoodBlueste -9.325e+03 2.077e+04 -0.449 0.653550
## NeighborhoodBrDale -1.402e+04 1.049e+04 -1.336 0.181807
## NeighborhoodBrkSide -8.594e+03 8.895e+03 -0.966 0.334130
## NeighborhoodClearCr -1.539e+04 1.125e+04 -1.368 0.171687
## NeighborhoodCollgCr 7.892e+02 7.829e+03 0.101 0.919728
## NeighborhoodCrawfor 6.134e+03 8.970e+03 0.684 0.494190
## NeighborhoodEdwards -1.856e+04 8.416e+03 -2.206 0.027610 *
## NeighborhoodGilbert 1.518e+03 8.594e+03 0.177 0.859826
## NeighborhoodIDOTRR -1.943e+04 9.392e+03 -2.069 0.038739 *
## NeighborhoodMeadowV -1.473e+04 1.066e+04 -1.381 0.167467
## NeighborhoodMitchel -1.731e+04 9.081e+03 -1.906 0.056942 .
## NeighborhoodNAmes -1.678e+04 8.196e+03 -2.047 0.040856 *
## NeighborhoodNoRidge 4.123e+04 9.253e+03 4.456 9.19e-06 ***
## NeighborhoodNPkVill -6.431e+03 1.289e+04 -0.499 0.618028
## NeighborhoodNridgHt 2.854e+04 8.247e+03 3.460 0.000559 ***
## NeighborhoodNWAmes -1.974e+04 8.868e+03 -2.226 0.026221 *
## NeighborhoodOldTown -2.806e+04 8.469e+03 -3.313 0.000951 ***
## NeighborhoodSawyer -1.483e+04 8.932e+03 -1.660 0.097139 .
## NeighborhoodSawyerW -5.453e+03 8.506e+03 -0.641 0.521619
## NeighborhoodSomerst 1.137e+04 7.983e+03 1.424 0.154733
## NeighborhoodStoneBr 4.930e+04 9.625e+03 5.122 3.54e-07 ***
## NeighborhoodSWISU -2.005e+04 9.793e+03 -2.047 0.040844 *
## NeighborhoodTimber 6.006e+03 9.053e+03 0.663 0.507182
## NeighborhoodVeenker 8.735e+03 1.300e+04 0.672 0.501627
## X2ndFlrSF 5.093e+01 2.418e+00 21.060 < 2e-16 ***
## BsmtFinSF1 4.403e+01 3.836e+00 11.477 < 2e-16 ***
## OverallCond 5.389e+03 8.428e+02 6.395 2.34e-10 ***
## X1stFlrSF 5.001e+01 4.341e+00 11.522 < 2e-16 ***
## LotConfigCulDSac 2.349e+04 4.958e+03 4.737 2.44e-06 ***
## LotConfigFR2 -2.741e+03 5.272e+03 -0.520 0.603201
## LotConfigFR3 -1.926e+04 1.407e+04 -1.369 0.171397
## LotConfigInside 8.027e+02 2.243e+03 0.358 0.720498
## KitchenQualFa -3.369e+04 6.882e+03 -4.895 1.13e-06 ***
## KitchenQualGd -2.824e+04 3.951e+03 -7.148 1.57e-12 ***
## KitchenQualTA -3.354e+04 4.514e+03 -7.430 2.13e-13 ***
## OverallQual 1.034e+04 1.106e+03 9.351 < 2e-16 ***
## PoolArea 7.062e+01 2.223e+01 3.177 0.001528 **
## LotFrontage 2.158e+02 4.702e+01 4.588 4.96e-06 ***
## LotArea 5.395e-01 1.217e-01 4.435 1.01e-05 ***
## GarageArea 2.833e+01 5.045e+00 5.615 2.47e-08 ***
## SaleConditionAdjLand 1.637e+04 1.449e+04 1.130 0.258838
## SaleConditionAlloca 8.114e+03 9.453e+03 0.858 0.390895
## SaleConditionFamily -5.331e+03 7.124e+03 -0.748 0.454430
## SaleConditionNormal 8.044e+03 3.170e+03 2.537 0.011299 *
## SaleConditionPartial 2.883e+04 4.306e+03 6.695 3.39e-11 ***
## BsmtUnfSF 1.731e+01 3.754e+00 4.612 4.44e-06 ***
## BsmtFinSF2 2.684e+01 6.195e+00 4.333 1.60e-05 ***
## EnclosedPorch -2.291e+01 1.408e+01 -1.627 0.103952
## ExterQualFa -4.397e+04 1.051e+04 -4.184 3.09e-05 ***
## ExterQualGd -4.094e+04 5.508e+03 -7.432 2.09e-13 ***
## ExterQualTA -4.594e+04 6.254e+03 -7.345 3.91e-13 ***
## X3SsnPorch 1.420e+01 2.733e+01 0.520 0.603444
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26890 on 1137 degrees of freedom
## (259 observations deleted due to missingness)
## Multiple R-squared: 0.9015, Adjusted R-squared: 0.896
## F-statistic: 165.1 on 63 and 1137 DF, p-value: < 2.2e-16
#forming our tester for the kaggle competition
pred_price<-predict(aic_lm,newdata = house_test)
#For any NA prediction, let's fill in with the estimated mean sale price (con95$estimate)
#My score was 0.21!!!!!
ans<-data.frame(Id=house_test[,1],SalePrice=pred_price)
ans[is.na(ans)]=con95$estimate
write.csv(ans,file="Final_kaggleVH.csv")
My username is vylvspebs.