# tidyverse
# Attach the tidyverse (dplyr, ggplot2, tibble, ...) used throughout this script.
library(tidyverse)
É importante informar quais são as strings que representam NA's
# Read the raw listings. Both the literal "NA" and empty fields are treated as
# missing, and text columns are kept as character instead of factors.
airbnb <- read.csv(
  file = "airbnb.csv",
  na.strings = c("NA", ""),
  stringsAsFactors = FALSE
)
# Inspect the variables and their types.
utils::str(airbnb)
## 'data.frame': 49075 obs. of 15 variables:
## $ cont : int 1 2 3 4 5 6 7 8 9 10 ...
## $ number : int 2787 2845 4632 4869 7192 7322 7356 8967 7490 7549 ...
## $ host : chr "John" "Jennifer" "Elisabeth" "LisaRoxanne" ...
## $ area : chr "Brooklyn" "Manhattan" "Manhattan" "Brooklyn" ...
## $ location : chr "Kensington" "Midtown" "Harlem" "Clinton Hill" ...
## $ latitude : num 40.6 40.8 40.8 40.7 40.8 ...
## $ longitude : num -74 -74 -73.9 -74 -73.9 ...
## $ room : chr "Private room" "Entire home/apt" "Private room" "Entire home/apt" ...
## $ price : int 149 225 150 89 80 200 60 79 79 150 ...
## $ nights : int 1 1 3 1 10 3 45 2 2 1 ...
## $ reviews : int 9 45 0 270 9 74 49 430 118 160 ...
## $ mrev : num 0.21 0.38 NA 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
## $ last : chr "2018-10-19" "2019-05-21" NA "2019-07-05" ...
## $ host_listing: int 6 2 1 1 1 1 1 1 1 4 ...
## $ availability: chr "365" "355" "365" "194" ...
# Parse the character column `last` (ISO year-month-day) into Date values,
# then re-inspect the structure to confirm the new column type.
airbnb[["last"]] <- as.Date(airbnb[["last"]], format = "%Y-%m-%d")
utils::str(airbnb)
## 'data.frame': 49075 obs. of 15 variables:
## $ cont : int 1 2 3 4 5 6 7 8 9 10 ...
## $ number : int 2787 2845 4632 4869 7192 7322 7356 8967 7490 7549 ...
## $ host : chr "John" "Jennifer" "Elisabeth" "LisaRoxanne" ...
## $ area : chr "Brooklyn" "Manhattan" "Manhattan" "Brooklyn" ...
## $ location : chr "Kensington" "Midtown" "Harlem" "Clinton Hill" ...
## $ latitude : num 40.6 40.8 40.8 40.7 40.8 ...
## $ longitude : num -74 -74 -73.9 -74 -73.9 ...
## $ room : chr "Private room" "Entire home/apt" "Private room" "Entire home/apt" ...
## $ price : int 149 225 150 89 80 200 60 79 79 150 ...
## $ nights : int 1 1 3 1 10 3 45 2 2 1 ...
## $ reviews : int 9 45 0 270 9 74 49 430 118 160 ...
## $ mrev : num 0.21 0.38 NA 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
## $ last : Date, format: "2018-10-19" "2019-05-21" ...
## $ host_listing: int 6 2 1 1 1 1 1 1 1 4 ...
## $ availability: chr "365" "355" "365" "194" ...
# With dplyr: rename the column `last` to `data`.
airbnb <- dplyr::rename(airbnb, data = last)
Em R básico
# Base-R equivalent of the dplyr rename: the column `last` becomes `data`.
# The new name must be "data" (not "date") because every later step in this
# script selects and groups by a column called `data`.
colnames(airbnb)[colnames(airbnb) == "last"] <- "data"
Total de NA’s em cada coluna
# Count missing values per column: is.na() yields a logical matrix and
# MARGIN = 2 sums it column by column.
apply(X = is.na(airbnb), MARGIN = 2, FUN = sum)
## cont number host area location latitude
## 0 7532 7549 7532 7532 7532
## longitude room price nights reviews mrev
## 7532 7532 7532 7532 7532 16179
## data host_listing availability
## 16179 7532 7532
# Alternative: visit each column and count its missing entries.
sapply(airbnb, function(column) sum(is.na(column)))
## cont number host area location latitude
## 0 7532 7549 7532 7532 7532
## longitude room price nights reviews mrev
## 7532 7532 7532 7532 7532 16179
## data host_listing availability
## 16179 7532 7532
# Alternative: column sums of the logical missing-value matrix.
colSums(x = is.na(airbnb))
## cont number host area location latitude
## 0 7532 7549 7532 7532 7532
## longitude room price nights reviews mrev
## 7532 7532 7532 7532 7532 16179
## data host_listing availability
## 16179 7532 7532
# A two-column table (variable name, number of NAs) is easier to read.
data.frame(
  var = names(airbnb),
  nas = colSums(is.na(airbnb))
)
## var nas
## cont cont 0
## number number 7532
## host host 7549
## area area 7532
## location location 7532
## latitude latitude 7532
## longitude longitude 7532
## room room 7532
## price price 7532
## nights nights 7532
## reviews reviews 7532
## mrev mrev 16179
## data data 16179
## host_listing host_listing 7532
## availability availability 7532
Com exceção das variáveis ‘cont’ (sem NA's), ‘host’, ‘mrev’ e ‘data’, há 7532 NA's em todas as demais variáveis.
Vamos eliminar essas observações vazias do data frame utilizando a variável ‘number’ (poderia ser qualquer outra com 7532 NA’s)
# Drop the rows whose `number` is missing.
# Inside filter() the column is referenced by its bare name (data masking), so
# the condition is always evaluated on the data frame being filtered rather
# than on whatever the global `airbnb` object holds at call time.
airbnb <- dplyr::filter(airbnb, !is.na(number))
# Number of rows that remain after the filter.
nrow(airbnb)
## [1] 41543
Alternativamente com o %>% (pipe):
# Same filter written with the pipe. The bare column name refers to the piped
# data; `airbnb$number` would instead read the global object, which breaks as
# soon as an earlier pipe step changes the number of rows.
airbnb <- airbnb %>%
  filter(!is.na(number))
Observe o número de linhas. As observações vazias foram excluídas.
ex-ante
# Show the first six rows of the plain data frame.
head(airbnb, n = 6L)
## cont number host area location latitude longitude
## 1 1 2787 John Brooklyn Kensington 40.64749 -73.97237
## 2 2 2845 Jennifer Manhattan Midtown 40.75362 -73.98377
## 3 3 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190
## 4 4 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976
## 5 5 7192 Laura Manhattan East Harlem 40.79851 -73.94399
## 6 6 7322 Chris Manhattan Murray Hill 40.74767 -73.97500
## room price nights reviews mrev data host_listing
## 1 Private room 149 1 9 0.21 2018-10-19 6
## 2 Entire home/apt 225 1 45 0.38 2019-05-21 2
## 3 Private room 150 3 0 NA <NA> 1
## 4 Entire home/apt 89 1 270 4.64 2019-07-05 1
## 5 Entire home/apt 80 10 9 0.10 2018-11-19 1
## 6 Entire home/apt 200 3 74 0.59 2019-06-22 1
## availability
## 1 365
## 2 355
## 3 365
## 4 194
## 5 0
## 6 129
ex-post: o elegante tibble
# Convert the data frame to a tibble and print it.
airbnb <- airbnb %>%
  tibble::as_tibble()
airbnb
## # A tibble: 41,543 × 15
## cont number host area location latitude longitude room price nights
## <int> <int> <chr> <chr> <chr> <dbl> <dbl> <chr> <int> <int>
## 1 1 2787 John Broo… Kensing… 40.6 -74.0 Priv… 149 1
## 2 2 2845 Jennifer Manh… Midtown 40.8 -74.0 Enti… 225 1
## 3 3 4632 Elisabeth Manh… Harlem 40.8 -73.9 Priv… 150 3
## 4 4 4869 LisaRoxanne Broo… Clinton… 40.7 -74.0 Enti… 89 1
## 5 5 7192 Laura Manh… East Ha… 40.8 -73.9 Enti… 80 10
## 6 6 7322 Chris Manh… Murray … 40.7 -74.0 Enti… 200 3
## 7 7 7356 Garon Broo… Bedford… 40.7 -74.0 Priv… 60 45
## 8 8 8967 Shunichi Manh… Hell's … 40.8 -74.0 Priv… 79 2
## 9 9 7490 MaryEllen Manh… Upper W… 40.8 -74.0 Priv… 79 2
## 10 10 7549 Ben Manh… Chinato… 40.7 -74.0 Enti… 150 1
## # ℹ 41,533 more rows
## # ℹ 5 more variables: reviews <int>, mrev <dbl>, data <date>,
## # host_listing <int>, availability <chr>
# Structure of the tibble: column types and first values.
utils::str(airbnb)
## tibble [41,543 × 15] (S3: tbl_df/tbl/data.frame)
## $ cont : int [1:41543] 1 2 3 4 5 6 7 8 9 10 ...
## $ number : int [1:41543] 2787 2845 4632 4869 7192 7322 7356 8967 7490 7549 ...
## $ host : chr [1:41543] "John" "Jennifer" "Elisabeth" "LisaRoxanne" ...
## $ area : chr [1:41543] "Brooklyn" "Manhattan" "Manhattan" "Brooklyn" ...
## $ location : chr [1:41543] "Kensington" "Midtown" "Harlem" "Clinton Hill" ...
## $ latitude : num [1:41543] 40.6 40.8 40.8 40.7 40.8 ...
## $ longitude : num [1:41543] -74 -74 -73.9 -74 -73.9 ...
## $ room : chr [1:41543] "Private room" "Entire home/apt" "Private room" "Entire home/apt" ...
## $ price : int [1:41543] 149 225 150 89 80 200 60 79 79 150 ...
## $ nights : int [1:41543] 1 1 3 1 10 3 45 2 2 1 ...
## $ reviews : int [1:41543] 9 45 0 270 9 74 49 430 118 160 ...
## $ mrev : num [1:41543] 0.21 0.38 NA 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
## $ data : Date[1:41543], format: "2018-10-19" "2019-05-21" ...
## $ host_listing: int [1:41543] 6 2 1 1 1 1 1 1 1 4 ...
## $ availability: chr [1:41543] "365" "355" "365" "194" ...
Inicie utilizando a função ExPanD(airbnb) do pacote ExPanDaR.
# Keep only the twelve analysis columns, in this order, then print
# per-column summary statistics.
airbnb <- dplyr::select(
  airbnb,
  price, area, location, latitude, longitude, room,
  nights, reviews, mrev, host_listing, data, availability
)
summary(airbnb)
## price area location latitude
## Min. : 0.0 Length:41543 Length:41543 Min. :40.50
## 1st Qu.: 69.0 Class :character Class :character 1st Qu.:40.69
## Median : 109.0 Mode :character Mode :character Median :40.72
## Mean : 153.8 Mean :40.73
## 3rd Qu.: 178.0 3rd Qu.:40.76
## Max. :10000.0 Max. :40.91
##
## longitude room nights reviews
## Min. :-74.24 Length:41543 Min. : 1.000 Min. : 0.00
## 1st Qu.:-73.98 Class :character 1st Qu.: 1.000 1st Qu.: 1.00
## Median :-73.96 Mode :character Median : 2.000 Median : 5.00
## Mean :-73.95 Mean : 6.933 Mean : 23.17
## 3rd Qu.:-73.94 3rd Qu.: 5.000 3rd Qu.: 23.00
## Max. :-73.72 Max. :1250.000 Max. :629.00
##
## mrev host_listing data availability
## Min. : 0.010 Min. : 1.00 Min. :2011-04-25 Length:41543
## 1st Qu.: 0.190 1st Qu.: 1.00 1st Qu.:2018-07-07 Class :character
## Median : 0.720 Median : 1.00 Median :2019-05-19 Mode :character
## Mean : 1.374 Mean : 6.95 Mean :2018-10-03
## 3rd Qu.: 2.030 3rd Qu.: 2.00 3rd Qu.:2019-06-23
## Max. :58.500 Max. :327.00 Max. :2019-07-08
## NA's :8647 NA's :8647
Experimente com outras funções de descrição de dados.
Por exemplo:
# Descriptive statistics for every column via psych::describe().
psych::describe(x = airbnb)
## vars n mean sd median trimmed mad min max
## price 1 41543 153.84 243.55 109.00 121.94 72.65 0.00 10000.00
## area* 2 41543 2.67 0.73 3.00 2.61 1.48 1.00 5.00
## location* 3 41543 106.36 67.81 93.00 105.10 85.99 1.00 217.00
## latitude 4 41543 40.73 0.05 40.72 40.73 0.05 40.50 40.91
## longitude 5 41543 -73.95 0.05 -73.96 -73.96 0.04 -74.24 -73.72
## room* 6 41543 1.51 0.55 1.00 1.48 0.00 1.00 3.00
## nights 7 41543 6.93 20.38 2.00 3.51 1.48 1.00 1250.00
## reviews 8 41543 23.17 44.64 5.00 12.33 7.41 0.00 629.00
## mrev 9 32896 1.37 1.69 0.72 1.07 0.92 0.01 58.50
## host_listing 10 41543 6.95 32.61 1.00 1.51 0.00 1.00 327.00
## data 11 32896 NaN NA NA NaN NA Inf -Inf
## availability* 12 41543 154.65 150.51 120.00 143.60 176.43 1.00 426.00
## range skew kurtosis se
## price 10000.00 18.84 566.11 1.19
## area* 4.00 0.35 -0.08 0.00
## location* 216.00 0.25 -1.26 0.33
## latitude 0.41 0.23 0.14 0.00
## longitude 0.53 1.32 5.19 0.00
## room* 2.00 0.43 -0.95 0.00
## nights 1249.00 22.16 878.53 0.10
## reviews 629.00 3.76 20.48 0.22
## mrev 58.49 3.33 48.71 0.01
## host_listing 326.00 8.38 75.15 0.16
## data -Inf NA NA NA
## availability* 425.00 0.33 -1.49 0.74
price possui valor bem acima do terceiro
quartil.
nights também possui valores bem acima do
terceiro quartil.
reviews e mrev idem (mrev
também possui NA’s).
host_listing ibidem.
data possui NA’s.
# price
# g1: boxplot of price per borough, y axis on a log10 scale.
g1 <- ggplot(data = airbnb) +
  geom_boxplot(aes(x = area, y = price)) +
  scale_y_continuous(trans = "log10") +
  ylab("log(price)")
# g2: histogram of price, x axis limited to 0-1000.
g2 <- ggplot(data = airbnb, aes(x = price)) +
  geom_histogram(fill = "steelblue", col = "black") +
  xlim(0, 1000)
# gridExtra provides grid.arrange() for stacking the two plots.
library(gridExtra)
grid.arrange(g1, g2, nrow = 2)
# price: summary statistics of the price column.
summary(airbnb$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 69.0 109.0 153.8 178.0 10000.0
# The third quartile (75%) of price is US$ 178.
# Check the quantiles; the cut-off is the analyst's choice.
# Here we go with 5%.
quantile(x = airbnb[["price"]], probs = 0.05, na.rm = TRUE)
## 5%
## 40
# Upper (95%) quantile of price.
quantile(x = airbnb[["price"]], probs = 0.95, na.rm = TRUE)
## 95%
## 365
Com quantis de 5% e 95% os preços estão no intervalo de 40 a 365.
Visualizando price em relação à área (distritos de NYC)
e ao tipo de acomodação
# g1: price per borough on a log10 y axis, with smaller x tick labels.
g1 <- ggplot(airbnb, aes(x = area, y = price)) +
  geom_boxplot() +
  scale_y_log10() +
  labs(y = "log(price)") +
  theme(axis.text.x = element_text(size = rel(0.75)))
# g2: histogram of price restricted to the 0-1000 range.
g2 <- ggplot(airbnb, aes(x = price)) +
  geom_histogram(fill = "steelblue", colour = "black") +
  xlim(0, 1000)
# g3: price per room type on a log10 y axis.
g3 <- ggplot(airbnb, aes(x = room, y = price)) +
  geom_boxplot() +
  scale_y_log10() +
  labs(y = "log(price)") +
  theme(axis.text.x = element_text(size = rel(0.85)))
# Stack the three plots vertically.
grid.arrange(g1, g2, g3, nrow = 3)
select() - seleciona variáveis
group_by() - agrupa os dados
summarise() - sumariza os dados
# Median price per value of `data` (NAs in price are ignored).
df <- summarise(
  group_by(select(airbnb, data, price), data),
  mediana = median(price, na.rm = TRUE)
)
df é um data frame de medianas de preços por data.
Preços medianos de 01.jan.2015 até 08.jul.2019!
# g4: line of the median price over time, x axis limited to
# 2015-01-01 .. 2019-07-08.
date_window <- c("2015-01-01", "2019-07-08")
g4 <- ggplot(df, aes(x = data, y = mediana)) +
  geom_line(colour = "steelblue") +
  xlim(as.Date(date_window))
# Show the four price plots in a two-column grid.
grid.arrange(g1, g2, g3, g4, ncol = 2)
# Keep only the Manhattan listings.
df_man <- filter(airbnb, area == "Manhattan")
# Boxplot of Manhattan prices on a log10 y axis.
ggplot(df_man, aes(x = area, y = price)) +
  geom_boxplot(fill = "steelblue") +
  scale_y_log10() +
  labs(y = "log(price)")
Calculando as medianas dos preços por data:
# Median Manhattan price per value of `data`.
df_man1 <- summarise(
  group_by(df_man, data),
  mediana = median(price, na.rm = TRUE)
)
# m1: Manhattan price boxplot (built from df_man), log10 y axis.
m1 <- ggplot(df_man, aes(x = area, y = price)) +
  geom_boxplot() +
  scale_y_log10() +
  labs(y = "log(price)")
# m2: Manhattan price histogram limited to 0-1000.
m2 <- ggplot(df_man, aes(x = price)) +
  geom_histogram() +
  xlim(0, 1000)
# m3: Manhattan price per room type, log10 y axis.
m3 <- ggplot(df_man, aes(x = room, y = price)) +
  geom_boxplot() +
  scale_y_log10() +
  labs(y = "log(price)") +
  theme(axis.text.x = element_text(size = rel(0.75)))
# m4: median price over time; this one is built from df_man1, not df_man.
m4 <- ggplot(df_man1, aes(x = data, y = mediana)) +
  geom_line(colour = "darkred") +
  xlim(as.Date(c("2015-01-01", "2019-07-08"))) +
  scale_y_log10() +
  labs(y = "log(mediana do preço)")
# Two-column grid with the four Manhattan plots, then the price summary.
grid.arrange(m1, m2, m3, m4, ncol = 2)
summary(df_man[["price"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 95.0 150.0 197.5 220.0 10000.0
# Median Manhattan price per neighbourhood (`location`).
df_man2 <- df_man %>%
  group_by(location) %>%
  summarise(mediana = median(price, na.rm = TRUE))
# Horizontal bar chart of the medians, neighbourhoods ordered by median price.
# Note the data source here is df_man2. The bars use a constant red fill, so
# no `fill` aesthetic is mapped (a mapped fill would be overridden by the
# constant and never drawn).
m1 <- ggplot(
  data = df_man2,
  aes(x = reorder(location, mediana), y = mediana)
) +
  xlab("Bairro") +
  geom_col(fill = "red") +
  coord_flip()
m1
# nights
# Work on a copy so the filters applied to `nights` do not touch `airbnb`.
df_n <- airbnb
# Largest minimum-stay value in the data.
max(df_n$nights, na.rm = TRUE)
## [1] 1250
# Print the working copy of the listings.
df_n
## # A tibble: 41,543 × 12
## price area location latitude longitude room nights reviews mrev
## <int> <chr> <chr> <dbl> <dbl> <chr> <int> <int> <dbl>
## 1 149 Brooklyn Kensington 40.6 -74.0 Priv… 1 9 0.21
## 2 225 Manhattan Midtown 40.8 -74.0 Enti… 1 45 0.38
## 3 150 Manhattan Harlem 40.8 -73.9 Priv… 3 0 NA
## 4 89 Brooklyn Clinton Hill 40.7 -74.0 Enti… 1 270 4.64
## 5 80 Manhattan East Harlem 40.8 -73.9 Enti… 10 9 0.1
## 6 200 Manhattan Murray Hill 40.7 -74.0 Enti… 3 74 0.59
## 7 60 Brooklyn Bedford-Stuyve… 40.7 -74.0 Priv… 45 49 0.4
## 8 79 Manhattan Hell's Kitchen 40.8 -74.0 Priv… 2 430 3.47
## 9 79 Manhattan Upper West Side 40.8 -74.0 Priv… 2 118 0.99
## 10 150 Manhattan Chinatown 40.7 -74.0 Enti… 1 160 1.33
## # ℹ 41,533 more rows
## # ℹ 3 more variables: host_listing <int>, data <date>, availability <chr>
# Summary statistics of the nights column.
summary(df_n[["nights"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 6.933 5.000 1250.000
# Histogram of nights.
ggplot(df_n, aes(x = nights)) +
  geom_histogram()
# Lower (5%) quantile of nights.
quantile(x = df_n[["nights"]], probs = 0.05, na.rm = TRUE)
## 5%
## 1
# Upper (95%) quantile of nights, stored in qupper.
qupper <- quantile(x = df_n[["nights"]], probs = 0.95, na.rm = TRUE)
# Keep the listings whose nights lie in [1, 15].
# NOTE(review): the upper bound 15 is hard-coded; qupper is computed above but
# is not used by this filter — confirm which bound is intended.
df_n <- filter(df_n, nights >= 1, nights <= 15)
min(df_n[["nights"]])
## [1] 1
# Largest nights value after filtering.
max(df_n[["nights"]])
## [1] 15
# Plot the filtered nights: histogram (n1) and boxplot per borough (n2),
# stacked vertically.
n1 <- ggplot(df_n, aes(x = nights)) +
  geom_histogram()
n2 <- ggplot(df_n, aes(x = area, y = nights)) +
  geom_boxplot()
grid.arrange(n1, n2, nrow = 2)
Ao que parece o pessoal fica mais tempo em Manhattan:
# Repeat the previous steps for Manhattan only.
df_n1 <- df_n %>%
  filter(area == "Manhattan")
summary(df_n1[["nights"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.993 4.000 15.000
# Manhattan nights: boxplot (n1) above histogram (n2).
n1 <- ggplot(df_n1, aes(x = area, y = nights)) +
  geom_boxplot()
n2 <- ggplot(df_n1, aes(x = nights)) +
  geom_histogram()
grid.arrange(n1, n2, nrow = 2)
# reviews as a function of price
# Scatter plot of reviews against price, x axis on a log10 scale.
ggplot(data = airbnb, aes(x = price, y = reviews)) +
  geom_point(color = "gray50") +
  scale_x_continuous(trans = "log10") +
  xlab("log(preço)")
# Linear correlation between price and number of reviews.
cor(airbnb$price, airbnb$reviews)
## [1] -0.04897622
# The correlation between price and reviews appears to be close to zero.
# rev1: reviews per borough on a log10 y axis.
rev1 <- ggplot(airbnb, aes(x = area, y = reviews)) +
  geom_boxplot() +
  scale_y_log10() +
  labs(y = "log(reviews)")
# rev2: histogram of reviews.
rev2 <- ggplot(airbnb, aes(x = reviews)) +
  geom_histogram()
grid.arrange(rev1, rev2, nrow = 2)
summary(airbnb[["reviews"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 1.00 5.00 23.17 23.00 629.00
# Lower (5%) quantile of reviews.
quantile(x = airbnb[["reviews"]], probs = 0.05, na.rm = TRUE)
## 5%
## 0
# Upper (95%) quantile of reviews.
quantile(x = airbnb[["reviews"]], probs = 0.95, na.rm = TRUE)
## 95%
## 114
# reviews
# Print the tibble.
airbnb
## # A tibble: 41,543 × 12
## price area location latitude longitude room nights reviews mrev
## <int> <chr> <chr> <dbl> <dbl> <chr> <int> <int> <dbl>
## 1 149 Brooklyn Kensington 40.6 -74.0 Priv… 1 9 0.21
## 2 225 Manhattan Midtown 40.8 -74.0 Enti… 1 45 0.38
## 3 150 Manhattan Harlem 40.8 -73.9 Priv… 3 0 NA
## 4 89 Brooklyn Clinton Hill 40.7 -74.0 Enti… 1 270 4.64
## 5 80 Manhattan East Harlem 40.8 -73.9 Enti… 10 9 0.1
## 6 200 Manhattan Murray Hill 40.7 -74.0 Enti… 3 74 0.59
## 7 60 Brooklyn Bedford-Stuyve… 40.7 -74.0 Priv… 45 49 0.4
## 8 79 Manhattan Hell's Kitchen 40.8 -74.0 Priv… 2 430 3.47
## 9 79 Manhattan Upper West Side 40.8 -74.0 Priv… 2 118 0.99
## 10 150 Manhattan Chinatown 40.7 -74.0 Enti… 1 160 1.33
## # ℹ 41,533 more rows
## # ℹ 3 more variables: host_listing <int>, data <date>, availability <chr>
# Keep the listings whose review count lies in [0, 114], then summarise it.
df_r <- filter(airbnb, reviews >= 0, reviews <= 114)
summary(df_r[["reviews"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 1.00 4.00 15.11 18.00 114.00
# Histogram of the filtered review counts.
ggplot(data = df_r, aes(reviews)) +
  geom_histogram(col = "blue")
# Boxplots with the group medians overlaid.
# stat_summary() takes the summary function through `fun`; the older `fun.y`
# argument is deprecated since ggplot2 3.3.0.
# r2: reviews per borough (filtered data). The constant `group` puts every
# borough in one group so the dotted line connects the medians; red points
# mark each median.
r2 <- ggplot(data = df_r, aes(x = area, y = reviews)) +
  geom_boxplot() +
  stat_summary(fun = median, geom = "line", aes(group = 5), linetype = 3) +
  stat_summary(fun = median, geom = "point", colour = "red") +
  scale_y_continuous(trans = "log10") +
  ylab("log(reviews)")
# r3: the same layout for price per borough, using the full data.
r3 <- ggplot(data = airbnb, aes(x = area, y = price)) +
  geom_boxplot() +
  stat_summary(fun = median, geom = "line", aes(group = 5), linetype = 3) +
  stat_summary(fun = median, geom = "point", colour = "red") +
  scale_y_continuous(trans = "log10") +
  ylab("log(price)")
grid.arrange(r2, r3, nrow = 2)
# leaflet
# Scatter plot of the listing coordinates.
ggplot(data = airbnb) +
  geom_point(aes(x = longitude, y = latitude), col = "blue")
# Median and mean coordinates of the listings.
lat_mediana <- median(airbnb$latitude, na.rm = TRUE)
lon_mediana <- median(airbnb$longitude, na.rm = TRUE)
lat_media <- mean(airbnb$latitude, na.rm = TRUE)
lon_media <- mean(airbnb$longitude, na.rm = TRUE)
library(leaflet)
# Interactive map on the default tiles with one marker for the median position
# and one for the mean position, each with its own popup.
geo <- leaflet() %>%
  addTiles() %>%
  addMarkers(
    lng = c(lon_mediana, lon_media),
    lat = c(lat_mediana, lat_media),
    popup = c('Posição Mediana', 'Posição Média')
  )
geo