Part 1: Read the data
# Load the housing data from the CSV file into the data frame `df`
df <- read.csv(file = "home_data.csv")
# Put the columns of `df` on the search path so they can be used by bare name
attach(df)
Part 2: Save to rda file
# Persist the data frame `df` to an .rda file in the working directory
save(list = "df", file = "home_data.rda")
Part 3: Column names
# List the column names of the data frame
names(df)
## [1] "id" "date" "price" "bedrooms"
## [5] "bathrooms" "sqft_living" "sqft_lot" "floors"
## [9] "waterfront" "view" "condition" "grade"
## [13] "sqft_above" "sqft_basement" "yr_built" "yr_renovated"
## [17] "zipcode" "lat" "long" "sqft_living15"
## [21] "sqft_lot15"
Part 4: Data Dimensions
# Show the number of rows and the number of columns of the data frame
c(nrow(df), ncol(df))
## [1] 21613 21
Part 5: Data Structure
# Inspect the structure of the data frame: column types and leading values
utils::str(object = df)
## 'data.frame': 21613 obs. of 21 variables:
## $ id : num 7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
## $ date : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
## $ price : int 221900 538000 180000 604000 510000 1225000 257500 291850 229500 323000 ...
## $ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
## $ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
## $ floors : num 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : int 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : int 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : int 1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
## $ sqft_basement: int 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : int 1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
## $ yr_renovated : int 0 1991 0 0 0 0 0 0 0 0 ...
## $ zipcode : int 98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
## $ lat : num 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ sqft_living15: int 1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
## $ sqft_lot15 : int 5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
Part 6: Summary statistics
# Summary statistics (min, quartiles, median, mean, max) for selected columns.
# Columns are referenced explicitly as df$<name> so these calls do not depend
# on attach(df) having placed the columns on the search path.
# Response variable (Y): price
summary(df$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 75000 321950 450000 540088 645000 7700000
# Number of bedrooms
summary(df$bedrooms)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.000 3.000 3.371 4.000 33.000
# Square feet of living area
# NOTE(review): this uses sqft_living15, which is a separate column from
# sqft_living -- confirm which one is intended.
summary(df$sqft_living15)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 399 1490 1840 1987 2360 6210
# Square feet of lot area
# NOTE(review): this uses sqft_lot15, which is a separate column from
# sqft_lot -- confirm which one is intended.
summary(df$sqft_lot15)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 651 5100 7620 12768 10083 871200
# Year the house was built
summary(df$yr_built)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1900 1951 1975 1971 1997 2015
# Year the house was renovated
summary(df$yr_renovated)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.0 84.4 0.0 2015.0
Part 7: Exhaustive Summary
# loading the package
library(psych)
## Warning: package 'psych' was built under R version 3.5.3
# Extended descriptive statistics using describe() from the psych package
# (loaded with library(psych)). Columns are referenced explicitly as
# df$<name> so these calls do not depend on attach(df).
# Response variable (Y): price
describe(df$price)
## vars n mean sd median trimmed mad min max
## X1 1 21613 540088.1 367127.2 450000 481704 222390 75000 7700000
## range skew kurtosis se
## X1 7625000 4.02 34.57 2497.23
# Number of bedrooms
describe(df$bedrooms)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 21613 3.37 0.93 3 3.34 1.48 0 33 33 1.97 49.05
## se
## X1 0.01
# Square feet of living area
# NOTE(review): this uses sqft_living15, which is a separate column from
# sqft_living -- confirm which one is intended.
describe(df$sqft_living15)
## vars n mean sd median trimmed mad min max range skew
## X1 1 21613 1986.55 685.39 1840 1914.07 607.87 399 6210 5811 1.11
## kurtosis se
## X1 1.6 4.66
# Square feet of lot area
# NOTE(review): this uses sqft_lot15, which is a separate column from
# sqft_lot -- confirm which one is intended.
describe(df$sqft_lot15)
## vars n mean sd median trimmed mad min max range
## X1 1 21613 12768.46 27304.18 7620 7903.21 3713.91 651 871200 870549
## skew kurtosis se
## X1 9.51 150.71 185.73
# Year the house was built
describe(df$yr_built)
## vars n mean sd median trimmed mad min max range skew
## X1 1 21613 1971.01 29.37 1975 1973.1 34.1 1900 2015 115 -0.47
## kurtosis se
## X1 -0.66 0.2
# Year the house was renovated
describe(df$yr_renovated)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 21613 84.4 401.68 0 0 0 0 2015 2015 4.55 18.69
## se
## X1 2.73