library(readr)
library(dplyr)
library(ggplot2)
library(funModeling)
# Read the house data set from CSV into `house` (readr returns a tibble).
house <- read_csv(
  file = "/Users/Clevia/Documents/MLG/kc_house_data.csv"
)

# Print per-column summary statistics; the rendered output is pasted below.
summary(object = house)
##        id                date               price            bedrooms     
##  Min.   :1.000e+06   Length:21597       Min.   :  78000   Min.   : 1.000  
##  1st Qu.:2.123e+09   Class :character   1st Qu.: 322000   1st Qu.: 3.000  
##  Median :3.905e+09   Mode  :character   Median : 450000   Median : 3.000  
##  Mean   :4.580e+09                      Mean   : 540297   Mean   : 3.373  
##  3rd Qu.:7.309e+09                      3rd Qu.: 645000   3rd Qu.: 4.000  
##  Max.   :9.900e+09                      Max.   :7700000   Max.   :33.000  
##    bathrooms      sqft_living       sqft_lot           floors     
##  Min.   :0.500   Min.   :  370   Min.   :    520   Min.   :1.000  
##  1st Qu.:1.750   1st Qu.: 1430   1st Qu.:   5040   1st Qu.:1.000  
##  Median :2.250   Median : 1910   Median :   7618   Median :1.500  
##  Mean   :2.116   Mean   : 2080   Mean   :  15099   Mean   :1.494  
##  3rd Qu.:2.500   3rd Qu.: 2550   3rd Qu.:  10685   3rd Qu.:2.000  
##  Max.   :8.000   Max.   :13540   Max.   :1651359   Max.   :3.500  
##    waterfront            view          condition        grade       
##  Min.   :0.000000   Min.   :0.0000   Min.   :1.00   Min.   : 3.000  
##  1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:3.00   1st Qu.: 7.000  
##  Median :0.000000   Median :0.0000   Median :3.00   Median : 7.000  
##  Mean   :0.007547   Mean   :0.2343   Mean   :3.41   Mean   : 7.658  
##  3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:4.00   3rd Qu.: 8.000  
##  Max.   :1.000000   Max.   :4.0000   Max.   :5.00   Max.   :13.000  
##    sqft_above   sqft_basement       yr_built     yr_renovated    
##  Min.   : 370   Min.   :   0.0   Min.   :1900   Min.   :   0.00  
##  1st Qu.:1190   1st Qu.:   0.0   1st Qu.:1951   1st Qu.:   0.00  
##  Median :1560   Median :   0.0   Median :1975   Median :   0.00  
##  Mean   :1789   Mean   : 291.7   Mean   :1971   Mean   :  84.46  
##  3rd Qu.:2210   3rd Qu.: 560.0   3rd Qu.:1997   3rd Qu.:   0.00  
##  Max.   :9410   Max.   :4820.0   Max.   :2015   Max.   :2015.00  
##     zipcode           lat             long        sqft_living15 
##  Min.   :98001   Min.   :47.16   Min.   :-122.5   Min.   : 399  
##  1st Qu.:98033   1st Qu.:47.47   1st Qu.:-122.3   1st Qu.:1490  
##  Median :98065   Median :47.57   Median :-122.2   Median :1840  
##  Mean   :98078   Mean   :47.56   Mean   :-122.2   Mean   :1987  
##  3rd Qu.:98118   3rd Qu.:47.68   3rd Qu.:-122.1   3rd Qu.:2360  
##  Max.   :98199   Max.   :47.78   Max.   :-121.3   Max.   :6210  
##    sqft_lot15    
##  Min.   :   651  
##  1st Qu.:  5100  
##  Median :  7620  
##  Mean   : 12758  
##  3rd Qu.: 10083  
##  Max.   :871200
# Compact column-by-column overview (type plus first values) of `house`.
dplyr::glimpse(house)
## Rows: 21,597
## Columns: 21
## $ id            <dbl> 7129300520, 6414100192, 5631500400, 2487200875, 19544005~
## $ date          <chr> "10/13/2014", "12/9/2014", "2/25/2015", "12/9/2014", "2/~
## $ price         <dbl> 221900, 538000, 180000, 604000, 510000, 1230000, 257500,~
## $ bedrooms      <dbl> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2,~
## $ bathrooms     <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.~
## $ sqft_living   <dbl> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 189~
## $ sqft_lot      <dbl> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470,~
## $ floors        <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1~
## $ waterfront    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ view          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,~
## $ condition     <dbl> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4,~
## $ grade         <dbl> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7~
## $ sqft_above    <dbl> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 189~
## $ sqft_basement <dbl> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, ~
## $ yr_built      <dbl> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 20~
## $ yr_renovated  <dbl> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ zipcode       <dbl> 98178, 98125, 98028, 98136, 98074, 98053, 98003, 98198, ~
## $ lat           <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 47~
## $ long          <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -122.0~
## $ sqft_living15 <dbl> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 23~
## $ sqft_lot15    <dbl> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, ~
# Histogram of `price` using 100 bins, blue bars with black outlines.
ggplot(house, aes(x = price)) +
  geom_histogram(
    bins = 100,
    fill = "blue",
    color = "black"
  )

# Kernel density estimate of `price`, with a plot title.
ggplot(house, aes(x = price)) +
  geom_density() +
  labs(title = "Densidade Preço das Casas")

# Mean of `price` for each distinct value of `bedrooms`; the result is a data
# frame with one row per bedroom count and columns `bedrooms` and `price`.
preco_quarto <- aggregate(
  price ~ bedrooms,
  data = house,
  FUN = mean
)

# Frequency table (count, percentage, cumulative percentage) of bedroom counts.
funModeling::freq(house[["bedrooms"]])

##    var frequency percentage cumulative_perc
## 1    3      9824      45.49           45.49
## 2    4      6882      31.87           77.36
## 3    2      2760      12.78           90.14
## 4    5      1601       7.41           97.55
## 5    6       272       1.26           98.81
## 6    1       196       0.91           99.72
## 7    7        38       0.18           99.90
## 8    8        13       0.06           99.96
## 9    9         6       0.03           99.99
## 10  10         3       0.01          100.00
## 11  11         1       0.00          100.00
## 12  33         1       0.00          100.00
# Frequency table (count, percentage, cumulative percentage) of `grade` values.
funModeling::freq(house[["grade"]])

##    var frequency percentage cumulative_perc
## 1    7      8974      41.55           41.55
## 2    8      6065      28.08           69.63
## 3    9      2615      12.11           81.74
## 4    6      2038       9.44           91.18
## 5   10      1134       5.25           96.43
## 6   11       399       1.85           98.28
## 7    5       242       1.12           99.40
## 8   12        89       0.41           99.81
## 9    4        27       0.13           99.94
## 10  13        13       0.06          100.00
## 11   3         1       0.00          100.00
# Frequency table (count, percentage, cumulative percentage) of `floors` values.
funModeling::freq(house[["floors"]])

##   var frequency percentage cumulative_perc
## 1   1     10673      49.42           49.42
## 2   2      8235      38.13           87.55
## 3 1.5      1910       8.84           96.39
## 4   3       611       2.83           99.22
## 5 2.5       161       0.75           99.97
## 6 3.5         7       0.03          100.00

# Bar chart of the mean price by number of bedrooms

# One bar per `bedrooms` value, with bar height taken directly from the
# pre-aggregated `price` column (no counting is done by the plot layer).
ggplot(data = preco_quarto, mapping = aes(x = bedrooms, y = price)) +
  geom_col()