library(readr)
library(dplyr)
library(ggplot2)
library(funModeling)
# Read the house data set from its CSV file into `house`.
# NOTE(review): the path is absolute and machine-specific, so the script
# only runs as-is on a machine where this exact file exists.
house <- readr::read_csv(
  file = "/Users/Clevia/Documents/MLG/kc_house_data.csv"
)

# Per-column summary statistics; the console output is pasted below.
summary(object = house)
## id date price bedrooms
## Min. :1.000e+06 Length:21597 Min. : 78000 Min. : 1.000
## 1st Qu.:2.123e+09 Class :character 1st Qu.: 322000 1st Qu.: 3.000
## Median :3.905e+09 Mode :character Median : 450000 Median : 3.000
## Mean :4.580e+09 Mean : 540297 Mean : 3.373
## 3rd Qu.:7.309e+09 3rd Qu.: 645000 3rd Qu.: 4.000
## Max. :9.900e+09 Max. :7700000 Max. :33.000
## bathrooms sqft_living sqft_lot floors
## Min. :0.500 Min. : 370 Min. : 520 Min. :1.000
## 1st Qu.:1.750 1st Qu.: 1430 1st Qu.: 5040 1st Qu.:1.000
## Median :2.250 Median : 1910 Median : 7618 Median :1.500
## Mean :2.116 Mean : 2080 Mean : 15099 Mean :1.494
## 3rd Qu.:2.500 3rd Qu.: 2550 3rd Qu.: 10685 3rd Qu.:2.000
## Max. :8.000 Max. :13540 Max. :1651359 Max. :3.500
## waterfront view condition grade
## Min. :0.000000 Min. :0.0000 Min. :1.00 Min. : 3.000
## 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:3.00 1st Qu.: 7.000
## Median :0.000000 Median :0.0000 Median :3.00 Median : 7.000
## Mean :0.007547 Mean :0.2343 Mean :3.41 Mean : 7.658
## 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:4.00 3rd Qu.: 8.000
## Max. :1.000000 Max. :4.0000 Max. :5.00 Max. :13.000
## sqft_above sqft_basement yr_built yr_renovated
## Min. : 370 Min. : 0.0 Min. :1900 Min. : 0.00
## 1st Qu.:1190 1st Qu.: 0.0 1st Qu.:1951 1st Qu.: 0.00
## Median :1560 Median : 0.0 Median :1975 Median : 0.00
## Mean :1789 Mean : 291.7 Mean :1971 Mean : 84.46
## 3rd Qu.:2210 3rd Qu.: 560.0 3rd Qu.:1997 3rd Qu.: 0.00
## Max. :9410 Max. :4820.0 Max. :2015 Max. :2015.00
## zipcode lat long sqft_living15
## Min. :98001 Min. :47.16 Min. :-122.5 Min. : 399
## 1st Qu.:98033 1st Qu.:47.47 1st Qu.:-122.3 1st Qu.:1490
## Median :98065 Median :47.57 Median :-122.2 Median :1840
## Mean :98078 Mean :47.56 Mean :-122.2 Mean :1987
## 3rd Qu.:98118 3rd Qu.:47.68 3rd Qu.:-122.1 3rd Qu.:2360
## Max. :98199 Max. :47.78 Max. :-121.3 Max. :6210
## sqft_lot15
## Min. : 651
## 1st Qu.: 5100
## Median : 7620
## Mean : 12758
## 3rd Qu.: 10083
## Max. :871200
# Compact overview of `house`: one line per column with its type and the
# first few values.
dplyr::glimpse(house)
## Rows: 21,597
## Columns: 21
## $ id <dbl> 7129300520, 6414100192, 5631500400, 2487200875, 19544005~
## $ date <chr> "10/13/2014", "12/9/2014", "2/25/2015", "12/9/2014", "2/~
## $ price <dbl> 221900, 538000, 180000, 604000, 510000, 1230000, 257500,~
## $ bedrooms <dbl> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2,~
## $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.~
## $ sqft_living <dbl> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 189~
## $ sqft_lot <dbl> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470,~
## $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1~
## $ waterfront <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ view <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,~
## $ condition <dbl> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4,~
## $ grade <dbl> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7~
## $ sqft_above <dbl> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 189~
## $ sqft_basement <dbl> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, ~
## $ yr_built <dbl> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 20~
## $ yr_renovated <dbl> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ zipcode <dbl> 98178, 98125, 98028, 98136, 98074, 98053, 98003, 98198, ~
## $ lat <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 47~
## $ long <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -122.0~
## $ sqft_living15 <dbl> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 23~
## $ sqft_lot15 <dbl> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, ~
# Histogram of the `price` column, split into 100 bins.
ggplot(house, aes(price)) +
  geom_histogram(bins = 100, fill = "blue", colour = "black")

# Density curve of the same `price` column, with a plot title.
ggplot(house, aes(price)) +
  geom_density() +
  ggtitle("Densidade Preço das Casas")

# Mean `price` for each distinct value of `bedrooms`.
preco_quarto <- aggregate(price ~ bedrooms, data = house, FUN = mean)

# Frequency table of the number of bedrooms (console output pasted below).
funModeling::freq(house[["bedrooms"]])

## var frequency percentage cumulative_perc
## 1 3 9824 45.49 45.49
## 2 4 6882 31.87 77.36
## 3 2 2760 12.78 90.14
## 4 5 1601 7.41 97.55
## 5 6 272 1.26 98.81
## 6 1 196 0.91 99.72
## 7 7 38 0.18 99.90
## 8 8 13 0.06 99.96
## 9 9 6 0.03 99.99
## 10 10 3 0.01 100.00
## 11 11 1 0.00 100.00
## 12 33 1 0.00 100.00
# Frequency table of the `grade` column (console output pasted below).
funModeling::freq(house[["grade"]])

## var frequency percentage cumulative_perc
## 1 7 8974 41.55 41.55
## 2 8 6065 28.08 69.63
## 3 9 2615 12.11 81.74
## 4 6 2038 9.44 91.18
## 5 10 1134 5.25 96.43
## 6 11 399 1.85 98.28
## 7 5 242 1.12 99.40
## 8 12 89 0.41 99.81
## 9 4 27 0.13 99.94
## 10 13 13 0.06 100.00
## 11 3 1 0.00 100.00
# Frequency table of the `floors` column (console output pasted below).
funModeling::freq(house[["floors"]])

## var frequency percentage cumulative_perc
## 1 1 10673 49.42 49.42
## 2 2 8235 38.13 87.55
## 3 1.5 1910 8.84 96.39
## 4 3 611 2.83 99.22
## 5 2.5 161 0.75 99.97
## 6 3.5 7 0.03 100.00