house<-read.table("train.csv",sep = ",",header = T)
colnames(house)
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "Electrical"    "X1stFlrSF"    
## [45] "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
## [49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
## [61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
## [65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [69] "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"   "PoolArea"     
## [73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
## [77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [81] "SalePrice"
colSums(sapply(house, is.na))
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##             0             0             0           259             0 
##        Street         Alley      LotShape   LandContour     Utilities 
##             0          1369             0             0             0 
##     LotConfig     LandSlope  Neighborhood    Condition1    Condition2 
##             0             0             0             0             0 
##      BldgType    HouseStyle   OverallQual   OverallCond     YearBuilt 
##             0             0             0             0             0 
##  YearRemodAdd     RoofStyle      RoofMatl   Exterior1st   Exterior2nd 
##             0             0             0             0             0 
##    MasVnrType    MasVnrArea     ExterQual     ExterCond    Foundation 
##             8             8             0             0             0 
##      BsmtQual      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1 
##            37            37            38            37             0 
##  BsmtFinType2    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating 
##            38             0             0             0             0 
##     HeatingQC    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF 
##             0             0             1             0             0 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##             0             0             0             0             0 
##      HalfBath  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd 
##             0             0             0             0             0 
##    Functional    Fireplaces   FireplaceQu    GarageType   GarageYrBlt 
##             0             0           690            81            81 
##  GarageFinish    GarageCars    GarageArea    GarageQual    GarageCond 
##            81             0             0            81            81 
##    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch 
##             0             0             0             0             0 
##   ScreenPorch      PoolArea        PoolQC         Fence   MiscFeature 
##             0             0          1453          1179          1406 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##             0             0             0             0             0 
##     SalePrice 
##             0
sapply(house, function(x) sum(is.na(x))) ### few col with too much NA
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##             0             0             0           259             0 
##        Street         Alley      LotShape   LandContour     Utilities 
##             0          1369             0             0             0 
##     LotConfig     LandSlope  Neighborhood    Condition1    Condition2 
##             0             0             0             0             0 
##      BldgType    HouseStyle   OverallQual   OverallCond     YearBuilt 
##             0             0             0             0             0 
##  YearRemodAdd     RoofStyle      RoofMatl   Exterior1st   Exterior2nd 
##             0             0             0             0             0 
##    MasVnrType    MasVnrArea     ExterQual     ExterCond    Foundation 
##             8             8             0             0             0 
##      BsmtQual      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1 
##            37            37            38            37             0 
##  BsmtFinType2    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating 
##            38             0             0             0             0 
##     HeatingQC    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF 
##             0             0             1             0             0 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##             0             0             0             0             0 
##      HalfBath  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd 
##             0             0             0             0             0 
##    Functional    Fireplaces   FireplaceQu    GarageType   GarageYrBlt 
##             0             0           690            81            81 
##  GarageFinish    GarageCars    GarageArea    GarageQual    GarageCond 
##            81             0             0            81            81 
##    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch 
##             0             0             0             0             0 
##   ScreenPorch      PoolArea        PoolQC         Fence   MiscFeature 
##             0             0          1453          1179          1406 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##             0             0             0             0             0 
##     SalePrice 
##             0
house1<- house[,-c(7,58,73,74,75)] ## removing the col with too much NA
sapply(house1, function(x) sum(is.na(x)))
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##             0             0             0           259             0 
##        Street      LotShape   LandContour     Utilities     LotConfig 
##             0             0             0             0             0 
##     LandSlope  Neighborhood    Condition1    Condition2      BldgType 
##             0             0             0             0             0 
##    HouseStyle   OverallQual   OverallCond     YearBuilt  YearRemodAdd 
##             0             0             0             0             0 
##     RoofStyle      RoofMatl   Exterior1st   Exterior2nd    MasVnrType 
##             0             0             0             0             8 
##    MasVnrArea     ExterQual     ExterCond    Foundation      BsmtQual 
##             8             0             0             0            37 
##      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1  BsmtFinType2 
##            37            38            37             0            38 
##    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating     HeatingQC 
##             0             0             0             0             0 
##    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF  LowQualFinSF 
##             0             1             0             0             0 
##     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath      HalfBath 
##             0             0             0             0             0 
##  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd    Functional 
##             0             0             0             0             0 
##    Fireplaces    GarageType   GarageYrBlt  GarageFinish    GarageCars 
##             0            81            81            81             0 
##    GarageArea    GarageQual    GarageCond    PavedDrive    WoodDeckSF 
##             0            81            81             0             0 
##   OpenPorchSF EnclosedPorch    X3SsnPorch   ScreenPorch      PoolArea 
##             0             0             0             0             0 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##             0             0             0             0             0 
##     SalePrice 
##             0
Na<-data.frame(colMeans(is.na(house)*100))###display the % of NA in each column 
housemean<-house1
house1<- na.omit(house1)


housemean$LotFrontage[is.na(house$LotFrontage)]<-
  mean(house$LotFrontage,na.rm= T) ##change NA to mean in lotfrontage
###### housemean is the data we use as it exchange the NAs to mean 
sapply(housemean, function(x) sum(is.na(x))/nrow(housemean)*100) ##% of NA in each col
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##        Street      LotShape   LandContour     Utilities     LotConfig 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##     LandSlope  Neighborhood    Condition1    Condition2      BldgType 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##    HouseStyle   OverallQual   OverallCond     YearBuilt  YearRemodAdd 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##     RoofStyle      RoofMatl   Exterior1st   Exterior2nd    MasVnrType 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.54794521 
##    MasVnrArea     ExterQual     ExterCond    Foundation      BsmtQual 
##    0.54794521    0.00000000    0.00000000    0.00000000    2.53424658 
##      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1  BsmtFinType2 
##    2.53424658    2.60273973    2.53424658    0.00000000    2.60273973 
##    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating     HeatingQC 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF  LowQualFinSF 
##    0.00000000    0.06849315    0.00000000    0.00000000    0.00000000 
##     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath      HalfBath 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd    Functional 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##    Fireplaces    GarageType   GarageYrBlt  GarageFinish    GarageCars 
##    0.00000000    5.54794521    5.54794521    5.54794521    0.00000000 
##    GarageArea    GarageQual    GarageCond    PavedDrive    WoodDeckSF 
##    0.00000000    5.54794521    5.54794521    0.00000000    0.00000000 
##   OpenPorchSF EnclosedPorch    X3SsnPorch   ScreenPorch      PoolArea 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##     SalePrice 
##    0.00000000
housemean<- na.omit(housemean)  ###removing NA from the data without NA cols
###is.numeric(house$LotFrontage)
any(is.na(housemean)) ######no NA in the data 
## [1] FALSE
########## getting rid of cater data 
sapply(housemean, is.factor)
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##         FALSE         FALSE          TRUE         FALSE         FALSE 
##        Street      LotShape   LandContour     Utilities     LotConfig 
##          TRUE          TRUE          TRUE          TRUE          TRUE 
##     LandSlope  Neighborhood    Condition1    Condition2      BldgType 
##          TRUE          TRUE          TRUE          TRUE          TRUE 
##    HouseStyle   OverallQual   OverallCond     YearBuilt  YearRemodAdd 
##          TRUE         FALSE         FALSE         FALSE         FALSE 
##     RoofStyle      RoofMatl   Exterior1st   Exterior2nd    MasVnrType 
##          TRUE          TRUE          TRUE          TRUE          TRUE 
##    MasVnrArea     ExterQual     ExterCond    Foundation      BsmtQual 
##         FALSE          TRUE          TRUE          TRUE          TRUE 
##      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1  BsmtFinType2 
##          TRUE          TRUE          TRUE         FALSE          TRUE 
##    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating     HeatingQC 
##         FALSE         FALSE         FALSE          TRUE          TRUE 
##    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF  LowQualFinSF 
##          TRUE          TRUE         FALSE         FALSE         FALSE 
##     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath      HalfBath 
##         FALSE         FALSE         FALSE         FALSE         FALSE 
##  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd    Functional 
##         FALSE         FALSE          TRUE         FALSE          TRUE 
##    Fireplaces    GarageType   GarageYrBlt  GarageFinish    GarageCars 
##         FALSE          TRUE         FALSE          TRUE         FALSE 
##    GarageArea    GarageQual    GarageCond    PavedDrive    WoodDeckSF 
##         FALSE          TRUE          TRUE          TRUE         FALSE 
##   OpenPorchSF EnclosedPorch    X3SsnPorch   ScreenPorch      PoolArea 
##         FALSE         FALSE         FALSE         FALSE         FALSE 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##         FALSE         FALSE         FALSE          TRUE          TRUE 
##     SalePrice 
##         FALSE
str(housemean)
## 'data.frame':    1338 obs. of  76 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ LotFrontage  : num  65 80 68 60 84 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
##  $ LotShape     : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 1 1 1 1 4 1 4 4 ...
##  $ LandContour  : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Utilities    : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
##  $ LotConfig    : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
##  $ LandSlope    : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
##  $ Condition1   : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
##  $ Condition2   : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ BldgType     : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ HouseStyle   : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ RoofMatl     : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Exterior1st  : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
##  $ Exterior2nd  : Factor w/ 16 levels "AsbShng","AsphShn",..: 14 9 14 16 14 14 14 7 16 9 ...
##  $ MasVnrType   : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 3 2 3 2 3 4 4 3 3 ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
##  $ ExterCond    : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ Foundation   : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
##  $ BsmtQual     : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
##  $ BsmtCond     : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
##  $ BsmtExposure : Factor w/ 4 levels "Av","Gd","Mn",..: 4 2 3 4 1 4 1 3 4 4 ...
##  $ BsmtFinType1 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 3 1 3 1 3 3 3 1 6 3 ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 6 6 6 6 6 6 6 2 6 6 ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ HeatingQC    : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ CentralAir   : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Electrical   : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ GarageType   : Factor w/ 6 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : Factor w/ 3 levels "Fin","RFn","Unf": 2 2 2 3 2 3 2 2 3 2 ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
##  $ GarageCond   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ PavedDrive   : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:122] 18 40 49 79 89 90 91 100 103 109 ...
##   .. ..- attr(*, "names")= chr [1:122] "18" "40" "49" "79" ...
housemeannum<- housemean[, sapply(housemean, class) != "factor"]
str(housemeannum)
## 'data.frame':    1338 obs. of  38 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ LotFrontage  : num  65 80 68 60 84 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
housemeannum<- housemeannum[,-1]
  #####make a better name to work with ( or not )
###########PCA
###house1 <- data.frame(lapply(house, function(x) as.numeric(x))) #### convert all variables in to numeric
ggpairs(house[,1:10])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

###lets start the cheeky pca####################################
pca1<-prcomp(housemeannum, scale. = F)
#pca2<- princomp(housemeannum) ##different pca method
##pca3<-prcomp(house1, cor=T)
summary(pca1)
## Importance of components:
##                              PC1       PC2       PC3       PC4       PC5
## Standard deviation     78960.565 9.991e+03 592.24300 542.48130 506.72642
## Proportion of Variance     0.984 1.575e-02   0.00006   0.00005   0.00004
## Cumulative Proportion      0.984 9.998e-01   0.99986   0.99991   0.99995
##                              PC6       PC7   PC8   PC9  PC10  PC11  PC12
## Standard deviation     435.35212 208.05459 161.3 144.8 141.6 120.5 63.72
## Proportion of Variance   0.00003   0.00001   0.0   0.0   0.0   0.0  0.00
## Cumulative Proportion    0.99998   0.99998   1.0   1.0   1.0   1.0  1.00
##                        PC13  PC14  PC15  PC16  PC17  PC18  PC19  PC20
## Standard deviation     57.5 55.46 46.22 40.22 38.82 29.98 25.66 17.51
## Proportion of Variance  0.0  0.00  0.00  0.00  0.00  0.00  0.00  0.00
## Cumulative Proportion   1.0  1.00  1.00  1.00  1.00  1.00  1.00  1.00
##                         PC21  PC22  PC23  PC24   PC25   PC26   PC27 PC28
## Standard deviation     13.58 9.896 2.687 1.301 0.9197 0.8493 0.7222 0.53
## Proportion of Variance  0.00 0.000 0.000 0.000 0.0000 0.0000 0.0000 0.00
## Cumulative Proportion   1.00 1.000 1.000 1.000 1.0000 1.0000 1.0000 1.00
##                          PC29   PC30   PC31   PC32   PC33   PC34   PC35
## Standard deviation     0.4828 0.4129 0.3779 0.3155 0.2697 0.2148 0.1425
## Proportion of Variance 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## Cumulative Proportion  1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
##                             PC36      PC37
## Standard deviation     7.729e-12 7.729e-12
## Proportion of Variance 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.000e+00
#names(pca2)
#pca2$center
#summary(pca2)
#pca2$loadings
#plot(pca2) 
plot(pca1)### the frist comp is dominating, so we may consider to keep the first comp 

#plot(pca2,type="l") 
biplot(pca1)

plot(pca1,type="l")

#biplot(pca2)
###slightly different between 
####
head(pca1$x)
##          PC1        PC2        PC3       PC4        PC5        PC6
## 1  21648.840  2993.4692  261.61051 -765.7714 -151.91894  13.599403
## 2  -5296.591   922.4296 -740.57014 -206.5700  -49.47873 -17.426511
## 3  36735.381   705.5634  463.01666 -452.6203 -119.18314 -67.583796
## 4 -46773.597  -427.7323  579.51618 -174.3162  -81.46437 294.865987
## 5  63326.153 -1407.4602  495.79976 -436.7607 -108.13752 337.692687
## 6 -43622.702 -4892.1475   38.35393 -787.1895  533.53842  -6.487104
##          PC7        PC8       PC9       PC10       PC11        PC12
## 1 -130.04435   59.92000  58.41562  21.803505  105.69278    9.665286
## 2  -61.56814 -131.32812  28.52052  -5.088526 -201.92717  -23.866109
## 3 -117.12382   33.84081  65.83445  67.046695  110.04076   -7.244298
## 4  -11.73176  -33.40572 -54.79600 217.201531  104.52490 -178.649065
## 5 -159.42587  204.87907  45.00762 195.813155  -46.15079    1.220313
## 6 -129.66792  -50.99795  65.77797  76.940871   46.19438    7.356032
##         PC13        PC14       PC15       PC16       PC17        PC18
## 1   2.772538   37.449094 -0.9057268  1.6566416 -13.102007    5.314139
## 2 -27.919046    2.246281 -0.6972327 12.7959128 -30.930507    4.300160
## 3 -17.985637   40.720787 -0.4508918 -0.4760527  -9.622396    6.166468
## 4  92.560499 -110.548444 40.2684952 11.6696908  25.700747   -4.409205
## 5   2.846396   13.777530  6.0163292  9.2637158 -18.172048    3.365736
## 6 -14.905794   51.523328 -2.2525060  6.1372625 -38.409081 -314.555500
##         PC19        PC20       PC21        PC22      PC23       PC24
## 1 -31.655653  -8.7799006 -1.1674496   0.1060841  4.192466  0.2277685
## 2   3.760255   7.6523019  7.0133982  -0.1618263  1.259153  1.0472452
## 3 -22.443995  -7.1925108 -0.4730673   1.9234043 -2.742188 -0.5303354
## 4 -16.404688 -16.4222158  2.0899651 -38.4573562  3.897706  2.1182664
## 5  -6.266226   0.3580792 -1.3237317   6.0446826 -5.546401 -0.7201117
## 6 -24.471582   1.9253399  2.5065831   1.5303373 -2.882676 -1.0570371
##         PC25       PC26       PC27        PC28       PC29        PC30
## 1 -0.8694377  0.4186290  0.4727051 -0.34835994  0.5621131 -0.03955777
## 2 -0.4385096 -2.5343754  0.1108975  0.08292793 -0.4998716  0.77399197
## 3  1.1959238  0.1994161 -0.1483062  0.16943122 -0.4394287 -0.25585111
## 4  0.5010191  0.6503217  1.6741537  0.36393350 -0.3608919 -0.29275198
## 5 -0.8062699  0.1155160  0.7627400  0.02716354 -0.1398094 -0.34288163
## 6  1.2210664  0.9210174 -0.7080342  0.39819125  1.4187762 -0.16926417
##         PC31        PC32       PC33        PC34         PC35          PC36
## 1  0.3182303 -0.02040291  0.2061970  0.15291521 -0.038191003 -5.989031e-13
## 2 -0.7391459  0.36720016 -0.0764568  0.64105355 -0.008653881 -8.936246e-14
## 3  0.5098453 -0.16591396  0.2788568  0.20451842  0.066781248  5.030403e-14
## 4  0.8255804  0.67225783 -0.7458133  0.04604920 -0.092117620  6.460891e-14
## 5  0.4167688  0.07732564 -0.1087916  0.01916855 -0.086123545 -7.042504e-13
## 6  0.2295990  0.28555315 -0.1756015 -0.04846361  0.013104148  2.014629e-11
##            PC37
## 1  5.237542e-13
## 2  5.965345e-14
## 3  1.619347e-13
## 4  5.127376e-13
## 5  3.108748e-13
## 6 -2.603306e-13
# create data frame with scores
scores <- as.data.frame(pca1$x)
rotation<-pca1$rotation
# plot of observations
#ggplot(data = scores, aes(x = PC1, y = PC2, label = rownames(scores))) +
  #geom_hline(yintercept = 0, colour = "gray65") +
  #geom_vline(xintercept = 0, colour = "gray65") +
  #geom_text(colour = "tomato", alpha = 0.5, size = 4) +
  #ggtitle("PCA plot house price")  
housefactor<-Filter(is.factor,housemean)
finalhouse<- data.frame(scores[,1],housefactor,housemean$SalePrice)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.