Part 1: Read the data

# Read the external CSV file into a data frame called "df".
df <- read.csv("home_data.csv")
# attach() places df's columns on the search path so they can be used by
# bare name (e.g. price). The attached columns are a snapshot: later changes
# to df are not reflected in them, and same-named global objects mask them.
# Kept for code that refers to columns by bare name; prefer df$price.
attach(df)

Part 2: Save to rda file

# Write the data frame "df" to an .rda file in the working directory
save(list = "df", file = "home_data.rda")

Part 3: Column names

# List the names of all columns in df
names(df)
##  [1] "id"            "date"          "price"         "bedrooms"     
##  [5] "bathrooms"     "sqft_living"   "sqft_lot"      "floors"       
##  [9] "waterfront"    "view"          "condition"     "grade"        
## [13] "sqft_above"    "sqft_basement" "yr_built"      "yr_renovated" 
## [17] "zipcode"       "lat"           "long"          "sqft_living15"
## [21] "sqft_lot15"

Part 4: Data Dimensions

# Show the size of df as (number of rows, number of columns)
c(nrow(df), ncol(df))
## [1] 21613    21

Part 5: Data Structure

# Compact overview of df: each column's type and its first few values
utils::str(df)
## 'data.frame':    21613 obs. of  21 variables:
##  $ id           : num  7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
##  $ date         : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
##  $ price        : int  221900 538000 180000 604000 510000 1225000 257500 291850 229500 323000 ...
##  $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
##  $ sqft_lot     : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
##  $ floors       : num  1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : int  7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : int  1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
##  $ sqft_basement: int  0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : int  1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
##  $ yr_renovated : int  0 1991 0 0 0 0 0 0 0 0 ...
##  $ zipcode      : int  98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
##  $ lat          : num  47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num  -122 -122 -122 -122 -122 ...
##  $ sqft_living15: int  1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
##  $ sqft_lot15   : int  5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...

Part 6: Summary statistics

# Summary statistics (min, quartiles, median, mean, max) for selected columns.
# Columns are referenced explicitly as df$<name> so the results do not depend
# on attach() or on whatever else is on the search path.

# Response (Y) variable: price
summary(df$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   75000  321950  450000  540088  645000 7700000
# Number of bedrooms
summary(df$bedrooms)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.000   3.000   3.371   4.000  33.000
# Square feet of living area
# NOTE(review): this summarises sqft_living15; df also has a separate
# sqft_living column - confirm which one is intended.
summary(df$sqft_living15)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     399    1490    1840    1987    2360    6210
# Square feet of lot area
# NOTE(review): this summarises sqft_lot15; df also has a separate
# sqft_lot column - confirm which one is intended.
summary(df$sqft_lot15)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     651    5100    7620   12768   10083  871200
# Year the house was built
summary(df$yr_built)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1900    1951    1975    1971    1997    2015
# Year the house was renovated
# NOTE(review): the median is 0, so 0 presumably marks "never renovated";
# if so, the mean is not a meaningful year - confirm.
summary(df$yr_renovated)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     0.0    84.4     0.0  2015.0

Part 7: Exhaustive Summary

# loading the package
library(psych)
## Warning: package 'psych' was built under R version 3.5.3
# Extended descriptive statistics via describe() from the psych package
# (n, mean, sd, median, trimmed mean, mad, min, max, range, skew, kurtosis, se).
# Columns are referenced explicitly as df$<name> so the results do not depend
# on attach() or on whatever else is on the search path.

# Response (Y) variable: price
describe(df$price)
##    vars     n     mean       sd median trimmed    mad   min     max
## X1    1 21613 540088.1 367127.2 450000  481704 222390 75000 7700000
##      range skew kurtosis      se
## X1 7625000 4.02    34.57 2497.23
# Number of bedrooms
describe(df$bedrooms)
##    vars     n mean   sd median trimmed  mad min max range skew kurtosis
## X1    1 21613 3.37 0.93      3    3.34 1.48   0  33    33 1.97    49.05
##      se
## X1 0.01
# Square feet of living area
# NOTE(review): this describes sqft_living15; df also has a separate
# sqft_living column - confirm which one is intended.
describe(df$sqft_living15)
##    vars     n    mean     sd median trimmed    mad min  max range skew
## X1    1 21613 1986.55 685.39   1840 1914.07 607.87 399 6210  5811 1.11
##    kurtosis   se
## X1      1.6 4.66
# Square feet of lot area
# NOTE(review): this describes sqft_lot15; df also has a separate
# sqft_lot column - confirm which one is intended.
describe(df$sqft_lot15)
##    vars     n     mean       sd median trimmed     mad min    max  range
## X1    1 21613 12768.46 27304.18   7620 7903.21 3713.91 651 871200 870549
##    skew kurtosis     se
## X1 9.51   150.71 185.73
# Year the house was built
describe(df$yr_built)
##    vars     n    mean    sd median trimmed  mad  min  max range  skew
## X1    1 21613 1971.01 29.37   1975  1973.1 34.1 1900 2015   115 -0.47
##    kurtosis  se
## X1    -0.66 0.2
# Year the house was renovated
# NOTE(review): the median is 0, so 0 presumably marks "never renovated";
# if so, mean and sd are not meaningful as years - confirm.
describe(df$yr_renovated)
##    vars     n mean     sd median trimmed mad min  max range skew kurtosis
## X1    1 21613 84.4 401.68      0       0   0   0 2015  2015 4.55    18.69
##      se
## X1 2.73