# Load the training data from the working directory.
# stringsAsFactors = TRUE is set explicitly: the rest of the script separates
# categorical from numeric columns with is.factor(), and since R 4.0.0
# read.table() no longer converts strings to factors by default.
house <- read.table(
  "train.csv",
  sep = ",",
  header = TRUE,
  stringsAsFactors = TRUE
)
# List the column names (and their positions) of the raw data.
colnames(house)
## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "Alley" "LotShape"
## [9] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [13] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
## [25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
## [33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
## [37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
## [41] "HeatingQC" "CentralAir" "Electrical" "X1stFlrSF"
## [45] "X2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
## [49] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
## [53] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
## [57] "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
## [61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual"
## [65] "GarageCond" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
## [69] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea"
## [73] "PoolQC" "Fence" "MiscFeature" "MiscVal"
## [77] "MoSold" "YrSold" "SaleType" "SaleCondition"
## [81] "SalePrice"
# Number of missing values in each column of the raw data.
colSums(is.na(house))
## Id MSSubClass MSZoning LotFrontage LotArea
## 0 0 0 259 0
## Street Alley LotShape LandContour Utilities
## 0 1369 0 0 0
## LotConfig LandSlope Neighborhood Condition1 Condition2
## 0 0 0 0 0
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 0 0 0 0 0
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## 0 0 0 0 0
## MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 8 8 0 0 0
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 37 37 38 37 0
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## 38 0 0 0 0
## HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## 0 0 1 0 0
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 0 0 0 0 0
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 0 0 0 0 0
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 0 0 690 81 81
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## 81 0 0 81 81
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 0 0 0 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## 0 0 1453 1179 1406
## MiscVal MoSold YrSold SaleType SaleCondition
## 0 0 0 0 0
## SalePrice
## 0
# Same per-column NA count, computed column by column; a handful of columns
# turn out to be missing in a large share of the rows.
vapply(house, function(column) sum(is.na(column)), integer(1))
## Id MSSubClass MSZoning LotFrontage LotArea
## 0 0 0 259 0
## Street Alley LotShape LandContour Utilities
## 0 1369 0 0 0
## LotConfig LandSlope Neighborhood Condition1 Condition2
## 0 0 0 0 0
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 0 0 0 0 0
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## 0 0 0 0 0
## MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 8 8 0 0 0
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 37 37 38 37 0
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## 38 0 0 0 0
## HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## 0 0 1 0 0
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 0 0 0 0 0
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 0 0 0 0 0
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 0 0 690 81 81
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## 81 0 0 81 81
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 0 0 0 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## 0 0 1453 1179 1406
## MiscVal MoSold YrSold SaleType SaleCondition
## 0 0 0 0 0
## SalePrice
## 0
# Remove the five columns with the most missing values (each is NA in roughly
# half of the rows or more, see the counts above). The columns are selected by
# name rather than by position so the result does not silently change if the
# column order of the input file changes.
house1 <- house[
  ,
  !(names(house) %in% c("Alley", "FireplaceQu", "PoolQC", "Fence",
                        "MiscFeature")),
  drop = FALSE
]
# Remaining NA count per column after the drop.
vapply(house1, function(column) sum(is.na(column)), integer(1))
## Id MSSubClass MSZoning LotFrontage LotArea
## 0 0 0 259 0
## Street LotShape LandContour Utilities LotConfig
## 0 0 0 0 0
## LandSlope Neighborhood Condition1 Condition2 BldgType
## 0 0 0 0 0
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd
## 0 0 0 0 0
## RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 0 0 0 0 8
## MasVnrArea ExterQual ExterCond Foundation BsmtQual
## 8 0 0 0 37
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 37 38 37 0 38
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## 0 0 0 0 0
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 0 1 0 0 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 0 0 0 0 0
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 0 0 0 0 0
## Fireplaces GarageType GarageYrBlt GarageFinish GarageCars
## 0 81 81 81 0
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF
## 0 81 81 0 0
## OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## 0 0 0 0 0
## MiscVal MoSold YrSold SaleType SaleCondition
## 0 0 0 0 0
## SalePrice
## 0
# Percentage of NA per column of the raw data, stored in `Na` for inspection
# (it is assigned here, not printed).
Na <- data.frame(colMeans(is.na(house) * 100))
# `housemean` starts as a copy of the reduced data that still has all rows;
# `house1` becomes the variant where every row with any NA is removed.
housemean <- house1
house1 <- na.omit(house1)
# Replace missing LotFrontage values with the mean of the observed values.
# The mask and the mean are taken from housemean itself, so the assignment
# does not depend on another object having the same row order.
housemean$LotFrontage[is.na(housemean$LotFrontage)] <-
  mean(housemean$LotFrontage, na.rm = TRUE)
###### housemean is the data set used from here on: its LotFrontage NAs have
###### been replaced by the column mean
# Percentage of NA remaining in each column of housemean.
vapply(
  housemean,
  function(column) sum(is.na(column)) / nrow(housemean) * 100,
  numeric(1)
)
## Id MSSubClass MSZoning LotFrontage LotArea
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## Street LotShape LandContour Utilities LotConfig
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## LandSlope Neighborhood Condition1 Condition2 BldgType
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 0.00000000 0.00000000 0.00000000 0.00000000 0.54794521
## MasVnrArea ExterQual ExterCond Foundation BsmtQual
## 0.54794521 0.00000000 0.00000000 0.00000000 2.53424658
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 2.53424658 2.60273973 2.53424658 0.00000000 2.60273973
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 0.00000000 0.06849315 0.00000000 0.00000000 0.00000000
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## Fireplaces GarageType GarageYrBlt GarageFinish GarageCars
## 0.00000000 5.54794521 5.54794521 5.54794521 0.00000000
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF
## 0.00000000 5.54794521 5.54794521 0.00000000 0.00000000
## OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## MiscVal MoSold YrSold SaleType SaleCondition
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## SalePrice
## 0.00000000
# Listwise deletion: drop every row that still has an NA in any of the
# remaining columns.
housemean <- na.omit(housemean)
### is.numeric(house$LotFrontage)
# Sanity check: this prints FALSE when no missing values are left.
any(is.na(housemean))
## [1] FALSE
########## separating out the categorical (factor) columns
# Flag, for every column, whether it is a factor.
vapply(housemean, is.factor, logical(1))
## Id MSSubClass MSZoning LotFrontage LotArea
## FALSE FALSE TRUE FALSE FALSE
## Street LotShape LandContour Utilities LotConfig
## TRUE TRUE TRUE TRUE TRUE
## LandSlope Neighborhood Condition1 Condition2 BldgType
## TRUE TRUE TRUE TRUE TRUE
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd
## TRUE FALSE FALSE FALSE FALSE
## RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## TRUE TRUE TRUE TRUE TRUE
## MasVnrArea ExterQual ExterCond Foundation BsmtQual
## FALSE TRUE TRUE TRUE TRUE
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## TRUE TRUE TRUE FALSE TRUE
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## FALSE FALSE FALSE TRUE TRUE
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## TRUE TRUE FALSE FALSE FALSE
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## FALSE FALSE FALSE FALSE FALSE
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## FALSE FALSE TRUE FALSE TRUE
## Fireplaces GarageType GarageYrBlt GarageFinish GarageCars
## FALSE TRUE FALSE TRUE FALSE
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF
## FALSE TRUE TRUE TRUE FALSE
## OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## FALSE FALSE FALSE FALSE FALSE
## MiscVal MoSold YrSold SaleType SaleCondition
## FALSE FALSE FALSE TRUE TRUE
## SalePrice
## FALSE
# Print the structure of the cleaned data: number of rows and columns, each
# column's type, and its first few values.
str(housemean)
## 'data.frame': 1338 obs. of 76 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
## $ LotFrontage : num 65 80 68 60 84 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
## $ LotShape : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 1 1 1 1 4 1 4 4 ...
## $ LandContour : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Utilities : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
## $ LotConfig : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
## $ LandSlope : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
## $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
## $ Condition1 : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
## $ Condition2 : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
## $ BldgType : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ HouseStyle : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ RoofMatl : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Exterior1st : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
## $ Exterior2nd : Factor w/ 16 levels "AsbShng","AsphShn",..: 14 9 14 16 14 14 14 7 16 9 ...
## $ MasVnrType : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 3 2 3 2 3 4 4 3 3 ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
## $ ExterCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ Foundation : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
## $ BsmtQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
## $ BsmtCond : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
## $ BsmtExposure : Factor w/ 4 levels "Av","Gd","Mn",..: 4 2 3 4 1 4 1 3 4 4 ...
## $ BsmtFinType1 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 3 1 3 1 3 3 3 1 6 3 ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 6 6 6 6 6 6 6 2 6 6 ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ HeatingQC : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
## $ CentralAir : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
## $ Electrical : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ GarageType : Factor w/ 6 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : Factor w/ 3 levels "Fin","RFn","Unf": 2 2 2 3 2 3 2 2 3 2 ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
## $ GarageCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ PavedDrive : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:122] 18 40 49 79 89 90 91 100 103 109 ...
## .. ..- attr(*, "names")= chr [1:122] "18" "40" "49" "79" ...
# Keep only the non-factor (numeric) columns for the PCA.
# is.factor() is used instead of comparing class() with "factor": class() can
# return more than one string (e.g. for ordered factors), which would make the
# comparison unreliable. drop = FALSE keeps the result a data frame even if
# only one column qualifies.
housemeannum <- housemean[
  ,
  !vapply(housemean, is.factor, logical(1)),
  drop = FALSE
]
# Inspect the structure of the numeric subset.
str(housemeannum)
## 'data.frame': 1338 obs. of 38 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ LotFrontage : num 65 80 68 60 84 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
# Drop the Id column, which is a row identifier rather than a feature.
# Selecting it by name (instead of "first column") makes this step safe to
# re-run: a second execution no longer removes the next column.
housemeannum <- housemeannum[
  ,
  setdiff(names(housemeannum), "Id"),
  drop = FALSE
]
#####make a better name to work with ( or not )
###########PCA
###house1 <- data.frame(lapply(house, function(x) as.numeric(x))) #### convert all variables in to numeric
# Pairwise plot matrix for the first ten columns of the raw data.
# NOTE(review): ggpairs() is not defined in the visible code (it is exported
# by the GGally package) - confirm the package is attached earlier.
ggpairs(house[, seq_len(10)])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###lets start the cheeky pca####################################
# Principal component analysis of the numeric predictors.
# - SalePrice is excluded: it is combined with the PC1 scores further down in
#   this script, so including it in the PCA would leak it into the component
#   it is later placed next to.
# - Constant columns are removed because they cannot be scaled to unit
#   variance.
# - The columns are centered and scaled. Without scaling, the component
#   variances are driven by the columns with the largest raw variance (the
#   unscaled fit put 98.4% of the variance on PC1, sd ~ 78960).
# NOTE: the printed summary and score values that follow in this file were
# produced by the earlier unscaled fit; re-run to refresh them.
pca1 <- local({
  predictors <- housemeannum[
    ,
    setdiff(names(housemeannum), "SalePrice"),
    drop = FALSE
  ]
  varies <- vapply(predictors, function(column) sd(column) > 0, logical(1))
  prcomp(predictors[, varies, drop = FALSE], center = TRUE, scale. = TRUE)
})
#pca2<- princomp(housemeannum) ##different pca method
##pca3<-prcomp(house1, cor=T)
# Standard deviation and (cumulative) proportion of variance per component.
summary(pca1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 78960.565 9.991e+03 592.24300 542.48130 506.72642
## Proportion of Variance 0.984 1.575e-02 0.00006 0.00005 0.00004
## Cumulative Proportion 0.984 9.998e-01 0.99986 0.99991 0.99995
## PC6 PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 435.35212 208.05459 161.3 144.8 141.6 120.5 63.72
## Proportion of Variance 0.00003 0.00001 0.0 0.0 0.0 0.0 0.00
## Cumulative Proportion 0.99998 0.99998 1.0 1.0 1.0 1.0 1.00
## PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20
## Standard deviation 57.5 55.46 46.22 40.22 38.82 29.98 25.66 17.51
## Proportion of Variance 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## Cumulative Proportion 1.0 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 13.58 9.896 2.687 1.301 0.9197 0.8493 0.7222 0.53
## Proportion of Variance 0.00 0.000 0.000 0.000 0.0000 0.0000 0.0000 0.00
## Cumulative Proportion 1.00 1.000 1.000 1.000 1.0000 1.0000 1.0000 1.00
## PC29 PC30 PC31 PC32 PC33 PC34 PC35
## Standard deviation 0.4828 0.4129 0.3779 0.3155 0.2697 0.2148 0.1425
## Proportion of Variance 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## Cumulative Proportion 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
## PC36 PC37
## Standard deviation 7.729e-12 7.729e-12
## Proportion of Variance 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00
#names(pca2)
#pca2$center
#summary(pca2)
#pca2$loadings
#plot(pca2)
# Plot the variances of the principal components of pca1.
plot(pca1)### the first component dominates, so keeping only the first component may be considered
# NOTE(review): when prcomp() is run on unscaled columns, a dominant first
# component largely reflects the column with the largest raw variance - check
# the scaling of the fit before deciding to keep only PC1.
#plot(pca2,type="l")
# Biplot of the observations and variable loadings on the first two components.
biplot(pca1)
# The same component variances drawn as a line plot.
plot(pca1,type="l")
#biplot(pca2)
### results are slightly different between the methods (the original note was
### left unfinished; presumably prcomp vs. princomp - confirm)
####
# First rows of the score matrix (observations projected onto the components).
head(pca1$x)
## PC1 PC2 PC3 PC4 PC5 PC6
## 1 21648.840 2993.4692 261.61051 -765.7714 -151.91894 13.599403
## 2 -5296.591 922.4296 -740.57014 -206.5700 -49.47873 -17.426511
## 3 36735.381 705.5634 463.01666 -452.6203 -119.18314 -67.583796
## 4 -46773.597 -427.7323 579.51618 -174.3162 -81.46437 294.865987
## 5 63326.153 -1407.4602 495.79976 -436.7607 -108.13752 337.692687
## 6 -43622.702 -4892.1475 38.35393 -787.1895 533.53842 -6.487104
## PC7 PC8 PC9 PC10 PC11 PC12
## 1 -130.04435 59.92000 58.41562 21.803505 105.69278 9.665286
## 2 -61.56814 -131.32812 28.52052 -5.088526 -201.92717 -23.866109
## 3 -117.12382 33.84081 65.83445 67.046695 110.04076 -7.244298
## 4 -11.73176 -33.40572 -54.79600 217.201531 104.52490 -178.649065
## 5 -159.42587 204.87907 45.00762 195.813155 -46.15079 1.220313
## 6 -129.66792 -50.99795 65.77797 76.940871 46.19438 7.356032
## PC13 PC14 PC15 PC16 PC17 PC18
## 1 2.772538 37.449094 -0.9057268 1.6566416 -13.102007 5.314139
## 2 -27.919046 2.246281 -0.6972327 12.7959128 -30.930507 4.300160
## 3 -17.985637 40.720787 -0.4508918 -0.4760527 -9.622396 6.166468
## 4 92.560499 -110.548444 40.2684952 11.6696908 25.700747 -4.409205
## 5 2.846396 13.777530 6.0163292 9.2637158 -18.172048 3.365736
## 6 -14.905794 51.523328 -2.2525060 6.1372625 -38.409081 -314.555500
## PC19 PC20 PC21 PC22 PC23 PC24
## 1 -31.655653 -8.7799006 -1.1674496 0.1060841 4.192466 0.2277685
## 2 3.760255 7.6523019 7.0133982 -0.1618263 1.259153 1.0472452
## 3 -22.443995 -7.1925108 -0.4730673 1.9234043 -2.742188 -0.5303354
## 4 -16.404688 -16.4222158 2.0899651 -38.4573562 3.897706 2.1182664
## 5 -6.266226 0.3580792 -1.3237317 6.0446826 -5.546401 -0.7201117
## 6 -24.471582 1.9253399 2.5065831 1.5303373 -2.882676 -1.0570371
## PC25 PC26 PC27 PC28 PC29 PC30
## 1 -0.8694377 0.4186290 0.4727051 -0.34835994 0.5621131 -0.03955777
## 2 -0.4385096 -2.5343754 0.1108975 0.08292793 -0.4998716 0.77399197
## 3 1.1959238 0.1994161 -0.1483062 0.16943122 -0.4394287 -0.25585111
## 4 0.5010191 0.6503217 1.6741537 0.36393350 -0.3608919 -0.29275198
## 5 -0.8062699 0.1155160 0.7627400 0.02716354 -0.1398094 -0.34288163
## 6 1.2210664 0.9210174 -0.7080342 0.39819125 1.4187762 -0.16926417
## PC31 PC32 PC33 PC34 PC35 PC36
## 1 0.3182303 -0.02040291 0.2061970 0.15291521 -0.038191003 -5.989031e-13
## 2 -0.7391459 0.36720016 -0.0764568 0.64105355 -0.008653881 -8.936246e-14
## 3 0.5098453 -0.16591396 0.2788568 0.20451842 0.066781248 5.030403e-14
## 4 0.8255804 0.67225783 -0.7458133 0.04604920 -0.092117620 6.460891e-14
## 5 0.4167688 0.07732564 -0.1087916 0.01916855 -0.086123545 -7.042504e-13
## 6 0.2295990 0.28555315 -0.1756015 -0.04846361 0.013104148 2.014629e-11
## PC37
## 1 5.237542e-13
## 2 5.965345e-14
## 3 1.619347e-13
## 4 5.127376e-13
## 5 3.108748e-13
## 6 -2.603306e-13
# Component scores (one row per observation) as a data frame, and the loading
# matrix of the fit.
scores <- as.data.frame(pca1[["x"]])
rotation <- pca1[["rotation"]]
# plot of observations
#ggplot(data = scores, aes(x = PC1, y = PC2, label = rownames(scores))) +
#geom_hline(yintercept = 0, colour = "gray65") +
#geom_vline(xintercept = 0, colour = "gray65") +
#geom_text(colour = "tomato", alpha = 0.5, size = 4) +
#ggtitle("PCA plot house price")
# Categorical part of the cleaned data: every column that is a factor.
housefactor <- housemean[vapply(housemean, is.factor, logical(1))]
# Final modelling table: first-component scores, the factor columns, and the
# sale price. The first and last columns get names derived automatically from
# the expressions passed to data.frame().
finalhouse <- data.frame(scores[, 1], housefactor, housemean$SalePrice)
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Template example: per-column summary statistics of the `cars` data.
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.