Email: vanessasupit0910@gmail.com
LinkedIn: https://www.linkedin.com/in/your-account/
RPubs: https://rpubs.com/Vanessasupit/
The aim of this report is to apply Exploratory Data Analysis (EDA) to house sales in King County, Washington State, USA. The data set consists of historical data on houses sold between May 2014 and May 2015.
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## date = col_character()
## )
## i Use `spec()` for the full column specifications.
## cols(
## id = col_double(),
## date = col_character(),
## price = col_double(),
## bedrooms = col_double(),
## bathrooms = col_double(),
## sqft_living = col_double(),
## sqft_lot = col_double(),
## floors = col_double(),
## waterfront = col_double(),
## view = col_double(),
## condition = col_double(),
## grade = col_double(),
## sqft_above = col_double(),
## sqft_basement = col_double(),
## yr_built = col_double(),
## yr_renovated = col_double(),
## zipcode = col_double(),
## lat = col_double(),
## long = col_double(),
## sqft_living15 = col_double(),
## sqft_lot15 = col_double()
## )
## integer(0)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## 1 2 3 4 5
## 29 170 14020 5677 1701
##
## 3 4 5 6 7 8
## 4.630273e-05 1.250174e-03 1.120526e-02 9.436496e-02 4.155207e-01 2.808260e-01
## 9 10 11 12 13
## 1.210816e-01 5.250729e-02 1.847479e-02 4.120943e-03 6.019355e-04
## [1] "id" "price" "bedrooms" "bathrooms"
## [5] "sqft_living" "sqft_lot" "floors" "waterfront"
## [9] "view" "condition" "grade" "sqft_above"
## [13] "sqft_basement" "yr_built" "yr_renovated" "zipcode"
## [17] "lat" "long" "sqft_living15" "sqft_lot15"
## id price bedrooms bathrooms
## Min. :1.000e+06 Min. : 78000 Min. : 1.000 Min. :0.500
## 1st Qu.:2.123e+09 1st Qu.: 322000 1st Qu.: 3.000 1st Qu.:1.750
## Median :3.905e+09 Median : 450000 Median : 3.000 Median :2.250
## Mean :4.580e+09 Mean : 540297 Mean : 3.373 Mean :2.116
## 3rd Qu.:7.309e+09 3rd Qu.: 645000 3rd Qu.: 4.000 3rd Qu.:2.500
## Max. :9.900e+09 Max. :7700000 Max. :33.000 Max. :8.000
## sqft_living sqft_lot floors waterfront
## Min. : 370 Min. : 520 Min. :1.000 Min. :0.000000
## 1st Qu.: 1430 1st Qu.: 5040 1st Qu.:1.000 1st Qu.:0.000000
## Median : 1910 Median : 7618 Median :1.500 Median :0.000000
## Mean : 2080 Mean : 15099 Mean :1.494 Mean :0.007547
## 3rd Qu.: 2550 3rd Qu.: 10685 3rd Qu.:2.000 3rd Qu.:0.000000
## Max. :13540 Max. :1651359 Max. :3.500 Max. :1.000000
## view condition grade sqft_above
## Min. :0.0000 Min. :1.00 Min. : 3.000 Min. : 370
## 1st Qu.:0.0000 1st Qu.:3.00 1st Qu.: 7.000 1st Qu.:1190
## Median :0.0000 Median :3.00 Median : 7.000 Median :1560
## Mean :0.2343 Mean :3.41 Mean : 7.658 Mean :1789
## 3rd Qu.:0.0000 3rd Qu.:4.00 3rd Qu.: 8.000 3rd Qu.:2210
## Max. :4.0000 Max. :5.00 Max. :13.000 Max. :9410
## sqft_basement yr_built yr_renovated zipcode
## Min. : 0.0 Min. :1900 Min. : 0.00 Min. :98001
## 1st Qu.: 0.0 1st Qu.:1951 1st Qu.: 0.00 1st Qu.:98033
## Median : 0.0 Median :1975 Median : 0.00 Median :98065
## Mean : 291.7 Mean :1971 Mean : 84.46 Mean :98078
## 3rd Qu.: 560.0 3rd Qu.:1997 3rd Qu.: 0.00 3rd Qu.:98118
## Max. :4820.0 Max. :2015 Max. :2015.00 Max. :98199
## lat long sqft_living15 sqft_lot15
## Min. :47.16 Min. :-122.5 Min. : 399 Min. : 651
## 1st Qu.:47.47 1st Qu.:-122.3 1st Qu.:1490 1st Qu.: 5100
## Median :47.57 Median :-122.2 Median :1840 Median : 7620
## Mean :47.56 Mean :-122.2 Mean :1987 Mean : 12758
## 3rd Qu.:47.68 3rd Qu.:-122.1 3rd Qu.:2360 3rd Qu.: 10083
## Max. :47.78 Max. :-121.3 Max. :6210 Max. :871200
## [1] 0.7881271
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:e1071':
##
## impute
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.9.4 :)
## Examples and tutorials at livebook.datascienceheroes.com
## / Now in Spanish: librovivodecienciadedatos.ai
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.4 v stringr 1.4.0
## v tidyr 1.1.2 v forcats 0.5.0
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x Hmisc::src() masks dplyr::src()
## x Hmisc::summarize() masks dplyr::summarize()
library(Hmisc)
library(skimr)
#' Run a battery of exploratory summaries on a data frame
#'
#' Calls, in order, `glimpse()`, `skim()`, `df_status()`, `freq()`,
#' `profiling_num()`, `plot_num()` and `describe()` on `data`. These functions
#' are not defined here and must already be attached by the caller
#' (presumably from dplyr/tibble, skimr, funModeling and Hmisc -- confirm
#' against the library() calls of the surrounding report).
#'
#' @param data The data frame to profile.
#' @return The value of `describe(data)`, returned visibly so that a top-level
#'   call such as `basic_eda(df)` still auto-prints it, exactly as before.
basic_eda <- function(data) {
  glimpse(data)
  # R auto-prints values only at top level. Inside a function body a bare
  # `skim(data)` is evaluated and then thrown away, so its summary never
  # reaches the report. Print it explicitly.
  print(skim(data))
  df_status(data)
  freq(data)
  # Same issue as skim(): the returned profiling table was silently dropped.
  print(profiling_num(data))
  plot_num(data)
  # Last expression: this is the function's (visible) return value.
  describe(data)
}
# Run the combined EDA report on `df`; the glimpse output that follows shows
# 21,597 rows and 21 columns of house-sale records.
basic_eda(df)
## Rows: 21,597
## Columns: 21
## $ id <dbl> 7129300520, 6414100192, 5631500400, 2487200875, 19544...
## $ date <chr> "10/13/2014", "12/9/2014", "2/25/2015", "12/9/2014", ...
## $ price <dbl> 221900, 538000, 180000, 604000, 510000, 1230000, 2575...
## $ bedrooms <dbl> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4,...
## $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00,...
## $ sqft_living <dbl> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, ...
## $ sqft_lot <dbl> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 74...
## $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0...
## $ waterfront <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ view <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0,...
## $ condition <dbl> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4,...
## $ grade <dbl> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7...
## $ sqft_above <dbl> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, ...
## $ sqft_basement <dbl> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, ...
## $ yr_built <dbl> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960,...
## $ yr_renovated <dbl> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ zipcode <dbl> 98178, 98125, 98028, 98136, 98074, 98053, 98003, 9819...
## $ lat <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561,...
## $ long <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -12...
## $ sqft_living15 <dbl> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780,...
## $ sqft_lot15 <dbl> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 811...
## variable q_zeros p_zeros q_na p_na q_inf p_inf type unique
## 1 id 0 0.00 0 0 0 0 numeric 21420
## 2 date 0 0.00 0 0 0 0 character 372
## 3 price 0 0.00 0 0 0 0 numeric 3622
## 4 bedrooms 0 0.00 0 0 0 0 numeric 12
## 5 bathrooms 0 0.00 0 0 0 0 numeric 29
## 6 sqft_living 0 0.00 0 0 0 0 numeric 1034
## 7 sqft_lot 0 0.00 0 0 0 0 numeric 9776
## 8 floors 0 0.00 0 0 0 0 numeric 6
## 9 waterfront 21434 99.25 0 0 0 0 numeric 2
## 10 view 19475 90.17 0 0 0 0 numeric 5
## 11 condition 0 0.00 0 0 0 0 numeric 5
## 12 grade 0 0.00 0 0 0 0 numeric 11
## 13 sqft_above 0 0.00 0 0 0 0 numeric 942
## 14 sqft_basement 13110 60.70 0 0 0 0 numeric 306
## 15 yr_built 0 0.00 0 0 0 0 numeric 116
## 16 yr_renovated 20683 95.77 0 0 0 0 numeric 70
## 17 zipcode 0 0.00 0 0 0 0 numeric 70
## 18 lat 0 0.00 0 0 0 0 numeric 5033
## 19 long 0 0.00 0 0 0 0 numeric 751
## 20 sqft_living15 0 0.00 0 0 0 0 numeric 777
## 21 sqft_lot15 0 0.00 0 0 0 0 numeric 8682
## Warning in freq_logic(data = data, input = input, plot, na.rm, path_out =
## path_out): Skipping plot for variable 'date' (more than 100 categories)
## data
##
## 21 Variables 21597 Observations
## --------------------------------------------------------------------------------
## id
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 21420 1 4.58e+09 3.297e+09 5.127e+08 1.036e+09
## .25 .50 .75 .90 .95
## 2.123e+09 3.905e+09 7.309e+09 8.732e+09 9.297e+09
##
## lowest : 1000102 1200019 1200021 2800031 3600057
## highest: 9842300095 9842300485 9842300540 9895000040 9900000190
## --------------------------------------------------------------------------------
## date
## n missing distinct
## 21597 0 372
##
## lowest : 1/10/2015 1/12/2015 1/13/2015 1/14/2015 1/15/2015
## highest: 9/5/2014 9/6/2014 9/7/2014 9/8/2014 9/9/2014
## --------------------------------------------------------------------------------
## price
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 3622 1 540297 329526 210000 245000
## .25 .50 .75 .90 .95
## 322000 450000 645000 887000 1160000
##
## lowest : 78000 80000 81000 82000 82500
## highest: 5350000 5570000 6890000 7060000 7700000
## --------------------------------------------------------------------------------
## bedrooms
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 12 0.871 3.373 0.9427 2 2
## .25 .50 .75 .90 .95
## 3 3 4 4 5
##
## lowest : 1 2 3 4 5, highest: 8 9 10 11 33
##
## Value 1 2 3 4 5 6 7 8 9 10 11
## Frequency 196 2760 9824 6882 1601 272 38 13 6 3 1
## Proportion 0.009 0.128 0.455 0.319 0.074 0.013 0.002 0.001 0.000 0.000 0.000
##
## Value 33
## Frequency 1
## Proportion 0.000
## --------------------------------------------------------------------------------
## bathrooms
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 29 0.974 2.116 0.8432 1.00 1.00
## .25 .50 .75 .90 .95
## 1.75 2.25 2.50 3.00 3.50
##
## lowest : 0.50 0.75 1.00 1.25 1.50, highest: 6.50 6.75 7.50 7.75 8.00
## --------------------------------------------------------------------------------
## sqft_living
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 1034 1 2080 978 940 1090
## .25 .50 .75 .90 .95
## 1430 1910 2550 3254 3760
##
## lowest : 370 380 390 410 420, highest: 9640 9890 10040 12050 13540
## --------------------------------------------------------------------------------
## sqft_lot
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 9776 1 15099 17841 1801 3323
## .25 .50 .75 .90 .95
## 5040 7618 10685 21372 43307
##
## lowest : 520 572 600 609 635
## highest: 982998 1024068 1074218 1164794 1651359
## --------------------------------------------------------------------------------
## floors
## n missing distinct Info Mean Gmd
## 21597 0 6 0.823 1.494 0.5561
##
## lowest : 1.0 1.5 2.0 2.5 3.0, highest: 1.5 2.0 2.5 3.0 3.5
##
## Value 1.0 1.5 2.0 2.5 3.0 3.5
## Frequency 10673 1910 8235 161 611 7
## Proportion 0.494 0.088 0.381 0.007 0.028 0.000
## --------------------------------------------------------------------------------
## waterfront
## n missing distinct Info Sum Mean Gmd
## 21597 0 2 0.022 163 0.007547 0.01498
##
## --------------------------------------------------------------------------------
## view
## n missing distinct Info Mean Gmd
## 21597 0 5 0.267 0.2343 0.4322
##
## lowest : 0 1 2 3 4, highest: 0 1 2 3 4
##
## Value 0 1 2 3 4
## Frequency 19475 332 961 510 319
## Proportion 0.902 0.015 0.044 0.024 0.015
## --------------------------------------------------------------------------------
## condition
## n missing distinct Info Mean Gmd
## 21597 0 5 0.708 3.41 0.6159
##
## lowest : 1 2 3 4 5, highest: 1 2 3 4 5
##
## Value 1 2 3 4 5
## Frequency 29 170 14020 5677 1701
## Proportion 0.001 0.008 0.649 0.263 0.079
## --------------------------------------------------------------------------------
## grade
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 11 0.903 7.658 1.229 6 6
## .25 .50 .75 .90 .95
## 7 7 8 9 10
##
## lowest : 3 4 5 6 7, highest: 9 10 11 12 13
##
## Value 3 4 5 6 7 8 9 10 11 12 13
## Frequency 1 27 242 2038 8974 6065 2615 1134 399 89 13
## Proportion 0.000 0.001 0.011 0.094 0.416 0.281 0.121 0.053 0.018 0.004 0.001
## --------------------------------------------------------------------------------
## sqft_above
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 942 1 1789 875.8 850 970
## .25 .50 .75 .90 .95
## 1190 1560 2210 2950 3400
##
## lowest : 370 380 390 410 420, highest: 7880 8020 8570 8860 9410
## --------------------------------------------------------------------------------
## sqft_basement
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 306 0.776 291.7 422.4 0 0
## .25 .50 .75 .90 .95
## 0 0 560 970 1190
##
## lowest : 0 10 20 40 50, highest: 3260 3480 3500 4130 4820
## --------------------------------------------------------------------------------
## yr_built
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 116 1 1971 33.38 1915 1926
## .25 .50 .75 .90 .95
## 1951 1975 1997 2007 2011
##
## lowest : 1900 1901 1902 1903 1904, highest: 2011 2012 2013 2014 2015
## --------------------------------------------------------------------------------
## yr_renovated
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 70 0.122 84.46 161.8 0 0
## .25 .50 .75 .90 .95
## 0 0 0 0 0
##
## lowest : 0 1934 1940 1944 1945, highest: 2011 2012 2013 2014 2015
##
## Value 0 1935 1940 1945 1950 1955 1960 1965 1970 1975 1980
## Frequency 20683 1 2 6 4 13 12 16 27 25 43
## Proportion 0.958 0.000 0.000 0.000 0.000 0.001 0.001 0.001 0.001 0.001 0.002
##
## Value 1985 1990 1995 2000 2005 2010 2015
## Frequency 88 99 84 112 156 82 144
## Proportion 0.004 0.005 0.004 0.005 0.007 0.004 0.007
##
## For the frequency table, variable is rounded to the nearest 5
## --------------------------------------------------------------------------------
## zipcode
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 70 1 98078 60.78 98004 98008
## .25 .50 .75 .90 .95
## 98033 98065 98118 98155 98177
##
## lowest : 98001 98002 98003 98004 98005, highest: 98177 98178 98188 98198 98199
## --------------------------------------------------------------------------------
## lat
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 5033 1 47.56 0.1573 47.31 47.35
## .25 .50 .75 .90 .95
## 47.47 47.57 47.68 47.73 47.75
##
## lowest : 47.1559 47.1593 47.1622 47.1647 47.1764
## highest: 47.7771 47.7772 47.7774 47.7775 47.7776
## --------------------------------------------------------------------------------
## long
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 751 1 -122.2 0.1557 -122.4 -122.4
## .25 .50 .75 .90 .95
## -122.3 -122.2 -122.1 -122.0 -122.0
##
## lowest : -122.519 -122.515 -122.514 -122.512 -122.511
## highest: -121.325 -121.321 -121.319 -121.316 -121.315
## --------------------------------------------------------------------------------
## sqft_living15
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 777 1 1987 743.1 1140 1258
## .25 .50 .75 .90 .95
## 1490 1840 2360 2930 3300
##
## lowest : 399 460 620 670 690, highest: 5600 5610 5790 6110 6210
## --------------------------------------------------------------------------------
## sqft_lot15
## n missing distinct Info Mean Gmd .05 .10
## 21597 0 8682 1 12758 13385 2002 3668
## .25 .50 .75 .90 .95
## 5100 7620 10083 17822 37045
##
## lowest : 651 659 660 748 750, highest: 434728 438213 560617 858132 871200
## --------------------------------------------------------------------------------