#libraries for the final
library(matlib)
library(igraph)
library(expm)
library(ggplot2)
library(dplyr)
library(nnet)
library(stats)
library(matrixcalc)
library(ggpubr)

Part 1
(Hi Larry! I might be very tired, but I’m going to assume you mean the PageRank matrix given in the final.)
#Let's form matrix A
A<-matrix(c(0,.5,.5,0,0,0,0,0,0,0,0,0,1/3,1/3,0,0,1/3,0,0,0,0,0,.5,.5,0,0,0,.5,0,.5,0,0,0,1,0,0),nrow = 6,byrow = TRUE)
#We noticed, before applying our decay formula, that the Markov matrix is not stochastic: the second page is disconnected from the rest of the web graph
#here's where I'm not sure which is correct: removing the second row, or assuming that page two links to every page with the same probability
#we will go with the uniform probability given in the reading
Alt_A<-matrix(c(0,.5,.5,0,0,0,1/6,1/6,1/6,1/6,1/6,1/6,1/3,1/3,0,0,1/3,0,0,0,0,0,.5,.5,0,0,0,.5,0,.5,0,0,0,1,0,0),nrow = 6,byrow = TRUE)
# To form Matrix B, we will use the formula for decay given in the final notes
B<-0.85*Alt_A+(0.15/nrow(Alt_A))
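# Quick check (a sketch): with the decay term added, every row of B should sum to 1
rowSums(B)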
#form a uniform rank vector r and perform power iterations until it converges: r = B^n x r
r<-c(1/6,1/6,1/6,1/6,1/6,1/6)
#for 30 iterations, take powers of matrix B, transpose, and multiply with r until it converges
#the iteration converges close to the vector given in the final
for (i in 1:30) {
print(t(B%^%i)%*%r)
}
## [,1]
## [1,] 0.09583333
## [2,] 0.16666667
## [3,] 0.11944444
## [4,] 0.26111111
## [5,] 0.16666667
## [6,] 0.19027778
## [,1]
## [1,] 0.08245370
## [2,] 0.12318287
## [3,] 0.08934028
## [4,] 0.28118056
## [5,] 0.19342593
## [6,] 0.23041667
## [,1]
## [1,] 0.06776399
## [2,] 0.10280681
## [3,] 0.07749373
## [4,] 0.32051109
## [5,] 0.18726572
## [6,] 0.24415866
## [,1]
## [1,] 0.06152086
## [2,] 0.09032055
## [3,] 0.06836399
## [4,] 0.32668709
## [5,] 0.19773807
## [6,] 0.25536944
## [,1]
## [1,] 0.05716521
## [2,] 0.08331157
## [3,] 0.06394177
## [4,] 0.33889812
## [5,] 0.19600722
## [6,] 0.26067610
## [,1]
## [1,] 0.05491931
## [2,] 0.07921452
## [3,] 0.06109769
## [4,] 0.34168023
## [5,] 0.19895101
## [6,] 0.26413724
## [,1]
## [1,] 0.05353307
## [2,] 0.07687377
## [3,] 0.05956276
## [4,] 0.34529289
## [5,] 0.19874717
## [6,] 0.26599033
## [,1]
## [1,] 0.05276657
## [2,] 0.07551812
## [3,] 0.05864201
## [4,] 0.34644978
## [5,] 0.19951605
## [6,] 0.26710748
## [,1]
## [1,] 0.05231364
## [2,] 0.07473943
## [3,] 0.05812419
## [4,] 0.34753408
## [5,] 0.19955479
## [6,] 0.26773388
## [,1]
## [1,] 0.05205661
## [2,] 0.07428990
## [3,] 0.05782138
## [4,] 0.34797267
## [5,] 0.19975859
## [6,] 0.26810085
## [,1]
## [1,] 0.05190713
## [2,] 0.07403118
## [3,] 0.05764846
## [4,] 0.34830753
## [5,] 0.19979551
## [6,] 0.26831019
## [,1]
## [1,] 0.05182148
## [2,] 0.07388201
## [3,] 0.05754828
## [4,] 0.34846450
## [5,] 0.19985218
## [6,] 0.26843154
## [,1]
## [1,] 0.05177196
## [2,] 0.07379609
## [3,] 0.05749075
## [4,] 0.34857061
## [5,] 0.19986938
## [6,] 0.26850121
## [,1]
## [1,] 0.05174349
## [2,] 0.07374658
## [3,] 0.05745753
## [4,] 0.34862496
## [5,] 0.19988600
## [6,] 0.26854144
## [,1]
## [1,] 0.05172707
## [2,] 0.07371805
## [3,] 0.05743842
## [4,] 0.34865921
## [5,] 0.19989267
## [6,] 0.26856459
## [,1]
## [1,] 0.05171761
## [2,] 0.07370161
## [3,] 0.05742739
## [4,] 0.34867768
## [5,] 0.19989777
## [6,] 0.26857794
## [,1]
## [1,] 0.05171216
## [2,] 0.07369214
## [3,] 0.05742105
## [4,] 0.34868886
## [5,] 0.19990017
## [6,] 0.26858563
## [,1]
## [1,] 0.05170902
## [2,] 0.07368668
## [3,] 0.05741739
## [4,] 0.34869507
## [5,] 0.19990178
## [6,] 0.26859006
## [,1]
## [1,] 0.05170721
## [2,] 0.07368354
## [3,] 0.05741528
## [4,] 0.34869875
## [5,] 0.19990261
## [6,] 0.26859261
## [,1]
## [1,] 0.05170616
## [2,] 0.07368173
## [3,] 0.05741406
## [4,] 0.34870083
## [5,] 0.19990313
## [6,] 0.26859408
## [,1]
## [1,] 0.05170556
## [2,] 0.07368068
## [3,] 0.05741336
## [4,] 0.34870205
## [5,] 0.19990342
## [6,] 0.26859493
## [,1]
## [1,] 0.05170522
## [2,] 0.07368008
## [3,] 0.05741296
## [4,] 0.34870274
## [5,] 0.19990359
## [6,] 0.26859542
## [,1]
## [1,] 0.05170502
## [2,] 0.07367973
## [3,] 0.05741273
## [4,] 0.34870314
## [5,] 0.19990368
## [6,] 0.26859570
## [,1]
## [1,] 0.05170490
## [2,] 0.07367953
## [3,] 0.05741259
## [4,] 0.34870337
## [5,] 0.19990374
## [6,] 0.26859586
## [,1]
## [1,] 0.05170484
## [2,] 0.07367942
## [3,] 0.05741252
## [4,] 0.34870350
## [5,] 0.19990377
## [6,] 0.26859595
## [,1]
## [1,] 0.05170480
## [2,] 0.07367935
## [3,] 0.05741247
## [4,] 0.34870358
## [5,] 0.19990379
## [6,] 0.26859601
## [,1]
## [1,] 0.05170478
## [2,] 0.07367931
## [3,] 0.05741245
## [4,] 0.34870363
## [5,] 0.19990380
## [6,] 0.26859604
## [,1]
## [1,] 0.05170476
## [2,] 0.07367929
## [3,] 0.05741243
## [4,] 0.34870365
## [5,] 0.19990380
## [6,] 0.26859606
## [,1]
## [1,] 0.05170476
## [2,] 0.07367928
## [3,] 0.05741242
## [4,] 0.34870367
## [5,] 0.19990381
## [6,] 0.26859607
## [,1]
## [1,] 0.05170475
## [2,] 0.07367927
## [3,] 0.05741242
## [4,] 0.34870367
## [5,] 0.19990381
## [6,] 0.26859607
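# A variant of the power iteration (a sketch, assuming B and r as defined above): update r
# repeatedly and stop once it stops changing, instead of fixing the number of iterations
r_new <- r
repeat {
  r_old <- r_new
  r_new <- t(B) %*% r_old          # same update as t(B %^% i) %*% r, one step at a time
  if (max(abs(r_new - r_old)) < 1e-10) break
}
print(r_new)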
#Finding the eigendecomposition of B: B = Q D Q^(-1), where D is the diagonal matrix of eigenvalues
#Using eigen() to find the eigenvalues (for the diagonal matrix) and the eigenvectors
data<-eigen(B)
evalues<-data$values
dia<-diag(evalues)
Q<-data$vectors
#coming back to my previous comment, we do need that second row so the decomposition stays square (6x6). Use matrix multiplication, i.e. %*%, and the inverse of Q (not its transpose, since B is not symmetric)
decom<-Q%*%dia%*%solve(Q)
#let's peek at our eigenvalues to verify the largest eigenvalue is 1
print(evalues)
## [1] 1.00000000+0i 0.57619235+0i -0.42500000+0i -0.42500000-0i -0.34991524+0i
## [6] -0.08461044+0i
#let's check that the vector found through our power iteration sums to one
test<-sum(t(B%^%100)%*%r)
sprintf("Looking at our power iteration the vector sum is %1.2f",test)
## [1] "Looking at our power iteration the vector sum is 1.00"
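# Cross-check (a sketch): the PageRank vector is the leading eigenvector of t(B), since the
# iteration above multiplies by t(B); rescaled to sum to 1 it should match the power-iteration result
left_vec <- Re(eigen(t(B))$vectors[, 1])
print(left_vec / sum(left_vec))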
#graphing the page link structure with and without decay
#The 'graph' package does not exist anymore :(, will use igraph
plot(graph.adjacency(A,weighted = TRUE,mode = "directed"))
plot(graph.adjacency(B,weighted = TRUE,mode = "directed"))

Part 2
(Similar in taste to the eigenimages -> the toughest thing I learned. I will need some additional resources walking through this set! The RStudio I am using does not want to render the images, so please email me if they do not appear in this document!)
(For the resources used to aid me in this problem, I took inspiration from the RDocumentation page for the MNIST function to understand the structure of the pixels. For an understanding of PCA, I used the article on PCA in the [discussion](https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html) and used the formula for the cumulative proportion from there: Learning PCA predictions.)
#Loading in our data sets
train_data<-read.csv("train.csv")
test_data<-read.csv("test.csv")
#plot the first 10 images of the set
# Reading the Kaggle description of our data set, each row consists of 785 columns: a label plus 784 pixels
#to create one image, the data set states each image's dimensions are 28x28 pixels
#so we take each row's pixel values and reshape them into a matrix of those dimensions
#in order to print the image, we need to convert the matrix entries to numeric with apply()
#To make sure the image is viewable, image() is given the 28x28 dimensions and a gray-scale palette
for (i in 1:10){
t<-matrix(train_data[i,-1],nrow=28,ncol=28)
t1<- apply(t, 2, as.numeric)
image(1:28, 1:28, t1, col=gray((0:255)/255))
}
#Go ahead and divide all pixels by 255 to produce values between 0 and 1.
#we will divide each image (i.e. t1) by 255 and not the original matrix, as this is the normalization of the pixels: each row (image) is divided by 255. In order to do this, we have to transform our DF into a vector for the normalization
normP<-as.vector(unlist(train_data[,2:785]/255))
data_scale2<-array(normP,dim=c(nrow(train_data),28,28))
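# Quick check (a sketch): the normalized pixel values should now lie between 0 and 1
range(data_scale2)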
#When I tried this with the original data set, there were still only ten labels, just with rescaled values
data_scale<-train_data/255
#the frequency distribution of the numbers in the dataset
#Looking at the chart below, it appears there is a strong pixel density around the .003->.02 range in the pixels
#This could be because the handwritten numbers are mostly drawn around this area
freq<-data_scale%>%group_by(label)%>%summarise(count=n())%>% arrange(desc(label))
print(freq)
## # A tibble: 10 x 2
## label count
## <dbl> <int>
## 1 0.0353 4188
## 2 0.0314 4063
## 3 0.0275 4401
## 4 0.0235 4137
## 5 0.0196 3795
## 6 0.0157 4072
## 7 0.0118 4351
## 8 0.00784 4177
## 9 0.00392 4684
## 10 0 4132
#With our normalized data, the pixel frequency is seen below
## The top pixels are closest to 1
freq2<-table(normP)
tail(freq2)
## normP
## 0.980392156862745 0.984313725490196 0.988235294117647 0.992156862745098
## 40161 78319 465950 1066447
## 0.996078431372549 1
## 560067 223310
#For each number, provide the mean pixel intensity
#for each of the first ten images, we take its vector of pixel values and find the mean
#Looking at the ten images, it appears characters that require more strokes have a higher pixel density
for (j in 1:10){
t<-matrix(train_data[j,],nrow=28,ncol=28)
t1<- apply(t, 2, as.numeric)
m<- mean(unlist(t1))
print(m)
}
## [1] 21.23724
## [1] 56.89923
## [1] 17.125
## [1] 19.16964
## [1] 65.16964
## [1] 29.41454
## [1] 21.88648
## [1] 30.98342
## [1] 35.61862
## [1] 40.27168
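# An alternative reading of the question (a sketch): the mean pixel intensity for each digit label
# 0-9, averaging the per-image means within each label rather than over the first ten rows
tapply(rowMeans(train_data[, -1]), train_data$label, mean)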
#Reduce the data by using principal components that account for 95% of the variance
#using prcomp to find the principal components of the original matrix
# My understanding of PCA -> we are finding the covariance structure of the matrix and choosing the most efficient dimensions (directions) that align with our data
#To find the number of components that will capture 95% of the variability, we will look at the cumulative proportion
#If there were far fewer pixels, summary() of the PCA would show the cumulative proportion directly, but we will calculate it below
train_PCA<-prcomp(train_data)
#summary(train_PCA) Matrix is too big to print out
#To calculate cumulative proportion, we need the std of the matrix, its eigen vectors
pca_sd<-train_PCA$sdev
pca_cumprop<-cumsum(pca_sd^2)/sum(pca_sd^2)
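# A sketch to pull out the exact component count that first crosses 95% cumulative variance
which(pca_cumprop >= 0.95)[1]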
# It appears that the efficient number of components is around 150
qplot(1:785,pca_cumprop)+geom_line()+ geom_hline(yintercept=.95, linetype="dashed", color = "pink")+xlim(0,200)+theme_classic()+labs(title="Cumulative Proportion of the train dataset", x="# of Components", y="Cumulative explained variance")
#Plot the first 10 images generated by PCA. They will appear to be noise
#They appear as noise as there are not a lot of vectors with accounted detail; it's just colors (I definitely did something wrong, but I will push forward)
#we would have to reduce the dimensionality and reconstruct from several components to get a clearer picture (see the reconstruction sketch after the component plots below)
pca_img<-train_PCA$rotation
for (k in 1:10){
image(1:28,1:28,array(pca_img[,k],dim=c(28,28)),col=gray((0:255)/255))
}
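# A rough reconstruction sketch (assuming PCA is re-run on the pixel columns only, so each
# component has 784 entries): rebuild the first image from its top 150 components
pix_PCA <- prcomp(train_data[, 2:785])    # can take a while on the full training set
k <- 150
recon <- pix_PCA$x[1, 1:k] %*% t(pix_PCA$rotation[, 1:k]) + pix_PCA$center
image(1:28, 1:28, matrix(as.numeric(recon), 28, 28), col = gray((0:255)/255))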
#select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance
#we will follow the same steps as above
#I noticed the characters are more defined than in the previous PCA run; it could be that there is less noise for the character 8 compared to the mix of all labels
e_only<-train_data%>%filter(label==8)
ePCA<-prcomp(e_only)
e_sd<-ePCA$sdev
pca_cumprop<-cumsum(e_sd^2)/sum(e_sd^2)
pca_img<-ePCA$rotation
for (l in 1:10){
image(1:28,1:28,array(pca_img[,l],dim=c(28,28)))
}
#Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix)
#first, we break out the x and y of our matrix: the labels go in the y vector and the pixel values in x
#Then, we can build our multinomial regression. I learned from the first attempt that the weight limit (MaxNWts) needs to be increased to accommodate the model
x_tr<-train_data[1:14000,2:785]
y_tr<-train_data[1:14000,1]
multi_model<-multinom(y_tr~.,x_tr,MaxNWts=10000)
## # weights: 7860 (7065 variable)
## initial value 32236.191302
## iter 10 value 7241.767111
## iter 20 value 5183.685696
## iter 30 value 4666.784980
## iter 40 value 4458.317898
## iter 50 value 4348.716039
## iter 60 value 4290.724407
## iter 70 value 4259.056822
## iter 80 value 4232.320111
## iter 90 value 4201.865931
## iter 100 value 4175.612162
## final value 4175.612162
## stopped after 100 iterations
#Test the current model on our test set and see its accuracy via the confusion matrix
x_ts<-train_data[14001:28000,2:785]
y_ts<-train_data[14001:28000,1]
predict_multi<-predict(multi_model,newdata = x_ts)
# Flag | The caret package is not available for the version of R I am using
#I will include a screenshot of this section from DataCamp | The code used is commented out below :(
# confusionMatrix(predict_multi, as.factor(y_ts))
#It appears the predictions have 87% accuracy
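# Since caret is unavailable, a base-R sketch of the same confusion matrix and accuracy
# (assumes predict_multi and y_ts as defined above)
conf_mat <- table(observed = y_ts, predicted = predict_multi)
print(conf_mat)
sprintf("Classification accuracy: %.1f%%", 100 * sum(diag(conf_mat)) / sum(conf_mat))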
I have also included a link to check the statistics if needed here.

Part 3
#loading in our housing data
house_train<-read.csv("train_house.csv")
house_test<-read.csv("test_house.csv")
#Provide univariate descriptive statistics and appropriate plots for the training data set
# we can use summary() for a descriptive analysis of each (quantitative/qualitative) variable in the data
summary(house_train)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
#Provide a scatter plot matrix for at least two of the independent variables and the dependent variable
ggplot(house_train,aes(x=OverallQual,y=MasVnrArea))+geom_point()+theme_classic()
## Warning: Removed 8 rows containing missing values (geom_point).
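# A scatter plot matrix that also includes the dependent variable (a sketch; OverallQual and
# GrLivArea are just an illustrative pick of independent variables)
pairs(house_train[, c("OverallQual", "GrLivArea", "SalePrice")])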
# Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval.
#To test the null hypothesis that there is no correlation, we can use cor.test() with a confidence level of 80% for each pairwise combination of variables
mCols<-house_train%>%select(YearRemodAdd,LotArea,MoSold)
corrM<-cor(mCols)
corrM
## YearRemodAdd LotArea MoSold
## YearRemodAdd 1.00000000 0.013788427 0.021490002
## LotArea 0.01378843 1.000000000 0.001204988
## MoSold 0.02149000 0.001204988 1.000000000
#None of our pairwise tests reject the null hypothesis, as their p-values are > 0.05
#we don't have to worry about a family-wise error here since we test each pair individually rather than as a group
#Individually, we can see whether there is a correlation between each pair of variables, i.e. which combination would reject
#as seen below, no pair rejects, which eliminates the chance of a false positive. If one of them did reject, then we would need to worry
cor.test(house_train$YearRemodAdd,house_train$LotArea,conf.level=.80)
##
## Pearson's product-moment correlation
##
## data: house_train$YearRemodAdd and house_train$LotArea
## t = 0.52654, df = 1458, p-value = 0.5986
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.01978237 0.04732816
## sample estimates:
## cor
## 0.01378843
cor.test(house_train$YearRemodAdd,house_train$MoSold,conf.level=.80)
##
## Pearson's product-moment correlation
##
## data: house_train$YearRemodAdd and house_train$MoSold
## t = 0.82076, df = 1458, p-value = 0.4119
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.01208035 0.05501196
## sample estimates:
## cor
## 0.02149
cor.test(house_train$LotArea,house_train$MoSold,conf.level=.80)
##
## Pearson's product-moment correlation
##
## data: house_train$LotArea and house_train$MoSold
## t = 0.046011, df = 1458, p-value = 0.9633
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.03235796 0.03476522
## sample estimates:
## cor
## 0.001204988
# Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.)
#Use solve() to retrieve the inverse
precMatrix<-solve(corrM)
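# Quick look (a sketch): the diagonal of the precision matrix holds the variance inflation
# factors mentioned above; with correlations this weak they should all be very close to 1
diag(precMatrix)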
# Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
#Let's use the lu.decomposition() function to perform the decomposition rather than coding it by hand
round(corrM %*% precMatrix)
## YearRemodAdd LotArea MoSold
## YearRemodAdd 1 0 0
## LotArea 0 1 0
## MoSold 0 0 1
round(precMatrix%*% corrM)
## YearRemodAdd LotArea MoSold
## YearRemodAdd 1 0 0
## LotArea 0 1 0
## MoSold 0 0 1
lu.decomposition(corrM)
## $L
## [,1] [,2] [,3]
## [1,] 1.00000000 0.0000000000 0
## [2,] 0.01378843 1.0000000000 0
## [3,] 0.02149000 0.0009088477 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 0.01378843 0.0214900022
## [2,] 0 0.99980988 0.0009086749
## [3,] 0 0.00000000 0.9995373540
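# Sanity check (a sketch): multiplying the L and U factors back together should reproduce corrM
luM <- lu.decomposition(corrM)
max(abs(luM$L %*% luM$U - corrM))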
# Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the
# minimum value is absolutely above zero if necessary.Then load the MASS package and run fitdistr to fit
# an exponential probability density function.
#SalePrice was one of the variables that is right-skewed and has no NA values; its minimum ($34,900) is already above zero, so no shift is needed
ggplot(house_train,aes(x=SalePrice))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Unloading the matlib package, as we cannot use MASS while it is loaded (MASS is one of its dependencies)
detach("package:matlib", unload=TRUE)
library(MASS)
## Warning: package 'MASS' was built under R version 4.1.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
alp<-fitdistr(house_train$SalePrice,densfun="exponential")
# then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable.
sam_alp<-rexp(1000,alp$estimate)
g1<-qplot(sam_alp)+geom_histogram()+labs(title="Optimally fitted sample values")+theme_light()
g2<-ggplot(house_train,aes(x=SalePrice))+geom_histogram()+labs(title="Original values")+theme_light()
fig<-ggarrange(g1,g2,ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
fig
# Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function
#Let's use qexp() to find the values at the 5th and 95th percentiles of the fitted distribution
lp<-qexp(0.05,alp$estimate,lower.tail = TRUE)
hp<-qexp(0.95,alp$estimate,lower.tail = TRUE)
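# Cross-check (a sketch): the exponential quantile has the closed form -log(1 - p) / lambda,
# which should agree with the qexp() values above
lam <- unname(alp$estimate)
c(p05 = -log(1 - 0.05) / lam, p95 = -log(1 - 0.95) / lam)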
# Also generate a 95% confidence interval from the empirical data, assuming normality
#We can use t.test() to retrieve the confidence interval of SalePrice
#We are 95% confident that the mean house sale price in the dataset falls between $176,842.8 and $184,999.6 (either way I can't afford it)
con95<-t.test(house_train$SalePrice,conf.level = 0.95)
con95$conf.int
## [1] 176842.8 184999.6
## attr(,"conf.level")
## [1] 0.95
#provide the empirical 5th percentile and 95th percentile of the data.
#This means 5% of the houses sold for less than $88K, while houses above $326.1K sit in the top 5% of the market
quantile(house_train$SalePrice,c(0.05,0.95))
## 5% 95%
## 88000 326100
# Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis.
#let's work less and use the step() function to find the best-fitting set of variables that affect SalePrice
#First, let's gather all the possible variables that can affect the house's sale price; it can only include factor variables with at least two levels for lm() to process
#Then, the step function will drop terms that do not improve the AIC score. We do have to deal with the NAs in the training data
complete_lm<-lm(formula=SalePrice~ MSSubClass + MSZoning+LotArea+ LotConfig + LandSlope + Neighborhood + Condition1 + Condition2+BldgType+HouseStyle+OverallQual+ OverallCond + YearBuilt+YearRemodAdd+RoofStyle+ RoofMatl + Exterior1st + Exterior2nd + MasVnrType+ MasVnrArea+ExterQual +ExterCond+ Foundation+ BsmtQual + BsmtCond + BsmtExposure + BsmtFinType1 +BsmtFinSF1 +BsmtFinType2+ BsmtFinSF2+ BsmtUnfSF +TotalBsmtSF+ Heating+ HeatingQC +Electrical + X1stFlrSF + X2ndFlrSF + LowQualFinSF + GrLivArea + BsmtFullBath+FullBath+BedroomAbvGr+KitchenAbvGr +KitchenQual +TotRmsAbvGrd+Functional +Fireplaces + FireplaceQu +GarageType + GarageYrBlt+ GarageFinish + GarageCars + GarageArea +GarageQual +GarageCond+WoodDeckSF +OpenPorchSF+EnclosedPorch+X3SsnPorch+ScreenPorch+PoolArea+MiscVal+MoSold+YrSold+SaleType+SaleCondition,data=house_train)
#step(complete_lm,direction = "backward")
#There are NAs in the data, which stop the step function from doing its magic. However, grabbing the AIC table from the function's output, I sorted the largest-AIC terms in Excel and built them into their own lm() regression below
#Back to the NAs: I noticed that I cannot simply remove the rows containing NAs, since there are no fully complete rows, which caused problems with NA predictions (a possible imputation workaround is sketched below)
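# One possible workaround (a sketch on a copy of the data, so the model below is unchanged):
# impute the numeric predictors that contain NAs with their column medians
house_imp <- house_train
for (col in c("LotFrontage", "MasVnrArea", "GarageYrBlt")) {
  house_imp[[col]][is.na(house_imp[[col]])] <- median(house_imp[[col]], na.rm = TRUE)
}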
aic_lm<-lm(formula = SalePrice~RoofMatl+Condition2+Neighborhood+X2ndFlrSF+BsmtFinSF1+OverallCond+X1stFlrSF+LotConfig+KitchenQual+OverallQual+PoolArea+LotFrontage+LotArea+GarageArea+SaleCondition+BsmtUnfSF+BsmtFinSF2+EnclosedPorch+ExterQual+X3SsnPorch
, data = house_train)
summary(aic_lm)
##
## Call:
## lm(formula = SalePrice ~ RoofMatl + Condition2 + Neighborhood +
## X2ndFlrSF + BsmtFinSF1 + OverallCond + X1stFlrSF + LotConfig +
## KitchenQual + OverallQual + PoolArea + LotFrontage + LotArea +
## GarageArea + SaleCondition + BsmtUnfSF + BsmtFinSF2 + EnclosedPorch +
## ExterQual + X3SsnPorch, data = house_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -163393 -12165 740 12051 171072
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.166e+05 4.131e+04 -17.348 < 2e-16 ***
## RoofMatlCompShg 7.112e+05 3.303e+04 21.533 < 2e-16 ***
## RoofMatlMembran 7.334e+05 4.423e+04 16.582 < 2e-16 ***
## RoofMatlRoll 6.860e+05 4.249e+04 16.144 < 2e-16 ***
## RoofMatlTar&Grv 6.945e+05 3.437e+04 20.205 < 2e-16 ***
## RoofMatlWdShake 7.092e+05 3.848e+04 18.433 < 2e-16 ***
## RoofMatlWdShngl 7.882e+05 3.446e+04 22.873 < 2e-16 ***
## Condition2Feedr 1.344e+04 2.239e+04 0.600 0.548583
## Condition2Norm 2.028e+04 1.941e+04 1.045 0.296318
## Condition2PosA 1.620e+04 3.410e+04 0.475 0.634796
## Condition2PosN -2.126e+05 2.783e+04 -7.638 4.66e-14 ***
## Condition2RRNn 3.058e+04 2.737e+04 1.117 0.264097
## NeighborhoodBlueste -9.325e+03 2.077e+04 -0.449 0.653550
## NeighborhoodBrDale -1.402e+04 1.049e+04 -1.336 0.181807
## NeighborhoodBrkSide -8.594e+03 8.895e+03 -0.966 0.334130
## NeighborhoodClearCr -1.539e+04 1.125e+04 -1.368 0.171687
## NeighborhoodCollgCr 7.892e+02 7.829e+03 0.101 0.919728
## NeighborhoodCrawfor 6.134e+03 8.970e+03 0.684 0.494190
## NeighborhoodEdwards -1.856e+04 8.416e+03 -2.206 0.027610 *
## NeighborhoodGilbert 1.518e+03 8.594e+03 0.177 0.859826
## NeighborhoodIDOTRR -1.943e+04 9.392e+03 -2.069 0.038739 *
## NeighborhoodMeadowV -1.473e+04 1.066e+04 -1.381 0.167467
## NeighborhoodMitchel -1.731e+04 9.081e+03 -1.906 0.056942 .
## NeighborhoodNAmes -1.678e+04 8.196e+03 -2.047 0.040856 *
## NeighborhoodNoRidge 4.123e+04 9.253e+03 4.456 9.19e-06 ***
## NeighborhoodNPkVill -6.431e+03 1.289e+04 -0.499 0.618028
## NeighborhoodNridgHt 2.854e+04 8.247e+03 3.460 0.000559 ***
## NeighborhoodNWAmes -1.974e+04 8.868e+03 -2.226 0.026221 *
## NeighborhoodOldTown -2.806e+04 8.469e+03 -3.313 0.000951 ***
## NeighborhoodSawyer -1.483e+04 8.932e+03 -1.660 0.097139 .
## NeighborhoodSawyerW -5.453e+03 8.506e+03 -0.641 0.521619
## NeighborhoodSomerst 1.137e+04 7.983e+03 1.424 0.154733
## NeighborhoodStoneBr 4.930e+04 9.625e+03 5.122 3.54e-07 ***
## NeighborhoodSWISU -2.005e+04 9.793e+03 -2.047 0.040844 *
## NeighborhoodTimber 6.006e+03 9.053e+03 0.663 0.507182
## NeighborhoodVeenker 8.735e+03 1.300e+04 0.672 0.501627
## X2ndFlrSF 5.093e+01 2.418e+00 21.060 < 2e-16 ***
## BsmtFinSF1 4.403e+01 3.836e+00 11.477 < 2e-16 ***
## OverallCond 5.389e+03 8.428e+02 6.395 2.34e-10 ***
## X1stFlrSF 5.001e+01 4.341e+00 11.522 < 2e-16 ***
## LotConfigCulDSac 2.349e+04 4.958e+03 4.737 2.44e-06 ***
## LotConfigFR2 -2.741e+03 5.272e+03 -0.520 0.603201
## LotConfigFR3 -1.926e+04 1.407e+04 -1.369 0.171397
## LotConfigInside 8.027e+02 2.243e+03 0.358 0.720498
## KitchenQualFa -3.369e+04 6.882e+03 -4.895 1.13e-06 ***
## KitchenQualGd -2.824e+04 3.951e+03 -7.148 1.57e-12 ***
## KitchenQualTA -3.354e+04 4.514e+03 -7.430 2.13e-13 ***
## OverallQual 1.034e+04 1.106e+03 9.351 < 2e-16 ***
## PoolArea 7.062e+01 2.223e+01 3.177 0.001528 **
## LotFrontage 2.158e+02 4.702e+01 4.588 4.96e-06 ***
## LotArea 5.395e-01 1.217e-01 4.435 1.01e-05 ***
## GarageArea 2.833e+01 5.045e+00 5.615 2.47e-08 ***
## SaleConditionAdjLand 1.637e+04 1.449e+04 1.130 0.258838
## SaleConditionAlloca 8.114e+03 9.453e+03 0.858 0.390895
## SaleConditionFamily -5.331e+03 7.124e+03 -0.748 0.454430
## SaleConditionNormal 8.044e+03 3.170e+03 2.537 0.011299 *
## SaleConditionPartial 2.883e+04 4.306e+03 6.695 3.39e-11 ***
## BsmtUnfSF 1.731e+01 3.754e+00 4.612 4.44e-06 ***
## BsmtFinSF2 2.684e+01 6.195e+00 4.333 1.60e-05 ***
## EnclosedPorch -2.291e+01 1.408e+01 -1.627 0.103952
## ExterQualFa -4.397e+04 1.051e+04 -4.184 3.09e-05 ***
## ExterQualGd -4.094e+04 5.508e+03 -7.432 2.09e-13 ***
## ExterQualTA -4.594e+04 6.254e+03 -7.345 3.91e-13 ***
## X3SsnPorch 1.420e+01 2.733e+01 0.520 0.603444
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26890 on 1137 degrees of freedom
## (259 observations deleted due to missingness)
## Multiple R-squared: 0.9015, Adjusted R-squared: 0.896
## F-statistic: 165.1 on 63 and 1137 DF, p-value: < 2.2e-16
#forming our tester for the kaggle competition
pred_price<-predict(aic_lm,newdata = house_test)
#For any NA prediction, let's fill in with the estimated mean sale price (con95$estimate)
#My score was 0.21!!!!!
ans<-data.frame(Id=house_test[,1],SalePrice=pred_price)
ans[is.na(ans)]=con95$estimate
write.csv(ans,file="Final_kaggleVH.csv")
My username is vylvspebs.