## Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
## (status 2 uses the sf package in place of rgdal)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'jsonlite'
##
##
## The following object is masked from 'package:purrr':
##
## flatten
##
##
##
## Attaching package: 'reshape2'
##
##
## The following object is masked from 'package:tidyr':
##
## smiths
##
##
## here() starts at /home/rstudio
##
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
## tmap mode set to interactive viewing
## Retrieving data for the year 2021
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========= | 14%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================= | 34%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|=================================== | 51%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================= | 71%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|=============================================================== | 91%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
# Load yelp data
load('yelp_usedbooks.RData')
load('yelp_artmuseums.RData')
# Extract coordinates
# Function to convert dataframe to sf object
df_to_sf <- function(df) {
df %>%
mutate(x = .$coordinates$longitude,
y = .$coordinates$latitude) %>%
filter(!is.na(x) & !is.na(y)) %>%
st_as_sf(coords = c("x", "y"), crs = 4326)
}
# Apply function to datasets
yelp_sf_usedbooks <- df_to_sf(yelp_all_usedbooks)
yelp_sf_artmuseums <- df_to_sf(yelp_all_artmuseums)
# Map
map_usedbooks_bf_tidy <- tm_shape(yelp_sf_usedbooks) +
tm_dots(col = "review_count", style="quantile")
map_artmuseums_bf_tidy <- tm_shape(yelp_sf_artmuseums) +
tm_dots(col = "review_count", style="quantile")
map_artmuseums_bf_tidy
# Delete duplicated rows
drop_duplicates <- function(df, name) {
unique_df <- df %>% distinct(id, .keep_all=T)
print(glue::glue("Before dropping duplicated rows, there were {nrow(df)} rows in {name}. After dropping them, there are {nrow(unique_df)} rows."))
return(unique_df)
}
# Apply function to datasets
unique_usedbooks <- drop_duplicates(yelp_all_usedbooks, "yelp_all_usedbooks")
## Before dropping duplicated rows, there were 20 rows in yelp_all_usedbooks. After dropping them, there are 12 rows.
unique_artmuseums <- drop_duplicates(yelp_all_artmuseums, "yelp_all_artmuseums")
## Before dropping duplicated rows, there were 65 rows in yelp_all_artmuseums. After dropping them, there are 27 rows.
Note: mutate{dplyr} creates new columns that are
functions of existing variables. It can also modify (if the name is the
same as an existing column) and delete columns (by setting their value
to NULL).
# Flatten nested columns that have multiple variables in one column(categories)
# Custom function that takes the data frame in "categories" column in Yelp data
# and returns a character vector
concate_list <- function(x){
# x is a data frame with columns "alias" and "title" from Yelp$categories
# returns a character vector containing category concatenated titles
titles <- x[["title"]] %>% str_c(collapse = ", ")
return(titles)
}
flat_usedbooks <- unique_usedbooks %>%
mutate(categories = categories %>% map_chr(concate_list))
flat_artmuseums <- unique_artmuseums %>%
mutate(categories = categories %>% str_c(collapse = ", "))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `categories = categories %>% str_c(collapse = ", ")`.
## Caused by warning in `stri_c()`:
## ! argument is not an atomic vector; coercing
print(flat_artmuseums)
## # A tibble: 27 × 16
## id alias name image_url is_closed url review_count categories rating
## <chr> <chr> <chr> <chr> <lgl> <chr> <int> <chr> <dbl>
## 1 FCt4zU9… colo… Colo… https://… FALSE http… 560 "list(ali… 4
## 2 lvBT0xJ… van-… Van … https://… FALSE http… 55 "list(ali… 3.5
## 3 bFg9sE5… post… Post… https://… FALSE http… 36 "list(ali… 4.5
## 4 zq8BL3I… hall… Hall… https://… FALSE http… 37 "list(ali… 4
## 5 07BcIHC… stud… Stud… https://… FALSE http… 46 "list(ali… 4
## 6 E672oSw… the-… The … https://… FALSE http… 517 "list(ali… 4.5
## 7 Ch4aTjv… loui… Loui… https://… FALSE http… 10 "list(ali… 4.5
## 8 9SfNHqx… bank… Bank… https://… FALSE http… 1 "list(ali… 5
## 9 JXERBrs… suga… Suga… https://… FALSE http… 12 "list(ali… 4.5
## 10 Uplh1Hi… the-… The … https://… FALSE http… 342 "list(ali… 4.5
## # ℹ 17 more rows
## # ℹ 7 more variables: coordinates <df[,2]>, transactions <list>,
## # location <df[,8]>, phone <chr>, display_phone <chr>, distance <dbl>,
## # price <chr>
Note: mutate{dplyr} creates new columns that are
functions of existing variables. It can also modify (if the name is the
same as an existing column) and delete columns (by setting their value
to NULL).
# Delete rows that have missing data in **coordinates** variable.
flat_usedbooks_1 <- select(flat_usedbooks, -transactions)
flat_artmuseums_1 <- select(flat_artmuseums, -transactions)
business_list <- list(flat_usedbooks, flat_artmuseums)
for(df in business_list) {
cat("Column names: ", toString(colnames(df)), "\n",
"Class of 'coordinates': ", class(df$coordinates), "\n",
"Number of NA in 'coordinates': ", sum(is.na(df$coordinates)), "\n")
}
## Column names: id, alias, name, image_url, is_closed, url, review_count, categories, rating, coordinates, transactions, price, location, phone, display_phone, distance
## Class of 'coordinates': data.frame
## Number of NA in 'coordinates': 0
## Column names: id, alias, name, image_url, is_closed, url, review_count, categories, rating, coordinates, transactions, location, phone, display_phone, distance, price
## Class of 'coordinates': data.frame
## Number of NA in 'coordinates': 0
# There are no missing values in coordinates columns :)
# Convert
flat_usedbooks_sf <- df_to_sf(flat_usedbooks_1)
flat_artmuseums_sf <- df_to_sf(flat_artmuseums_1)
# Delete rows that fall outside of the boundary of your choice.
tract <- st_set_crs(tract, 4326)
## Warning: st_crs<- : replacing crs does not reproject data; use st_transform for
## that
in_usedbooks <- flat_usedbooks_sf[tract %>% st_union(), ,op = st_intersects]
map_usedbooks_tidy <- tm_shape(in_usedbooks) + tm_dots(size=0.05, col = "review_count", style="quantile", palette = "Blues")
#+ tm_text("name", size=0.8 )
map_usedbooks_tidy
# Delete rows that fall outside of the boundary of your choice.
in_artmuseums <- flat_artmuseums_sf[tract %>% st_union(), ,op = st_intersects]
map_artmuseums_tidy <- tm_shape(in_artmuseums) + tm_dots(col = "review_count", style="quantile", palette = "Blues")
map_artmuseums_tidy
1.Have the findings changed before and after the tidying the data?
tmap_arrange(map_artmuseums_bf_tidy, map_artmuseums_tidy)
#freq_table <- flat_usedbooks_1$rating
mode <- function(x){which.max(tabulate(x))}
freq_table <- flat_artmuseums_1$rating
most_frequent <- mode(freq_table)
cat("The most art museum frequent rating score is", most_frequent)
## The most art museum frequent rating score is 4
correlation <- cor(flat_artmuseums_1$rating, flat_artmuseums_1$review_count)
cat("The correlation coefficient value is", correlation, ",indicating a weak positive correlation between rating score and review counts.")
## The correlation coefficient value is 0.1995933 ,indicating a weak positive correlation between rating score and review counts.
# In the tmap package, the order of the layers in your code determines the order in which they are drawn on the map. To move the dots to the front (i.e., draw them on top of the polygons), I should call tm_dots after tm_polygons
pattern_income_artmuseums <-
tm_shape(tract) +
tm_polygons(col = "hhincomeE", style="quantile", colorNA = NA, alpha = 0.6) +
tm_shape(in_artmuseums) +
tm_dots(col = "review_count", style="quantile", palette = "Blues")
pattern_income_usedbooks <-
tm_shape(tract) +
tm_polygons(col = "hhincomeE", style="quantile", colorNA = NA, alpha = 0.6) +
tm_shape(in_usedbooks) +
tm_dots(col = "review_count", style="quantile", palette = "Blues")
tmap_arrange(pattern_income_artmuseums, pattern_income_usedbooks)