# Toy dataset: one row per person, one column per treatment (NA marks a missing result)
toy_df <- data.frame(
  name = c("John", "Jane", "Mary"),
  treatment_a = c(NA, 16, 3),
  treatment_b = c(2, 11, 1),
  treatment_c = c(6, 12, NA)
)
print(toy_df)
## name treatment_a treatment_b treatment_c
## 1 John NA 2 6
## 2 Jane 16 11 12
## 3 Mary 3 1 NA
# Pivot longer: stack the three treatment columns into (treatment, result) pairs
toy_long <- toy_df %>%
  pivot_longer(
    cols = treatment_a:treatment_c,
    names_to = "treatment", # name (as a string) of the new column holding the old column names
    values_to = "result"    # name (as a string) of the new column holding the values
  )
# Print the reshaped table
toy_long
## # A tibble: 9 × 3
## name treatment result
## <chr> <chr> <dbl>
## 1 John treatment_a NA
## 2 John treatment_b 2
## 3 John treatment_c 6
## 4 Jane treatment_a 16
## 5 Jane treatment_b 11
## 6 Jane treatment_c 12
## 7 Mary treatment_a 3
## 8 Mary treatment_b 1
## 9 Mary treatment_c NA
# Pivot back to wide: one row per name, one column per treatment
toy_wide <- toy_long %>%
  pivot_wider(
    id_cols = name,         # column that uniquely identifies each output row
    names_from = treatment, # column whose values become the new column names
    values_from = result    # column whose values fill the new columns
  )
# Print the reshaped table
toy_wide
## # A tibble: 3 × 4
## name treatment_a treatment_b treatment_c
## <chr> <dbl> <dbl> <dbl>
## 1 John NA 2 6
## 2 Jane 16 11 12
## 3 Mary 3 1 NA
# Example data in which several names occur more than once
dupl_df <- data.frame(
  name = c("A", "A", "B", "C", "C", "C", "D"),
  GPA = c(3.5, 3.5, 4.0, 2.0, 3.0, 3.0, 2.0)
)
# Base R: TRUE marks a name that has already appeared earlier in the vector
duplicated(dupl_df[["name"]])
## [1] FALSE TRUE FALSE FALSE TRUE TRUE FALSE
The output is “## [1] FALSE TRUE FALSE FALSE TRUE TRUE FALSE”. The 1st value of name is “A”, and “A” shows up again as the 2nd value. So the 2nd value of the output is TRUE, which means that the “A” in the 2nd position is a duplicate of an earlier value.
# Keep only the first row for each value of "name" (later repeats are removed).
subset(dupl_df, !duplicated(name))
## name GPA
## 1 A 3.5
## 3 B 4.0
## 4 C 2.0
## 7 D 2.0
# Returns a data frame with a single column holding the unique names;
# the other columns are dropped unless .keep_all = TRUE is supplied.
distinct(dupl_df, name) # Try adding .keep_all = TRUE argument
## name
## 1 A
## 2 B
## 3 C
## 4 D
The output of “distinct(name, GPA)” has two rows for “C” but only one for “A” because “C” appears with two different GPA values (2.0 and 3.0), whereas both “A” rows have the same GPA.
# Returns a data frame with one row per unique (name, GPA) combination
distinct(dupl_df, name, GPA)
## name GPA
## 1 A 3.5
## 2 B 4.0
## 3 C 2.0
## 4 C 3.0
## 5 D 2.0
‘distinct(name, GPA)’ returns one row for each unique (name, GPA) combination.
Type1: Different variables can be concatenated into a long string. Solution: Using ‘separate()’ from tidyr package
# A one-column data frame whose labels contain zero, one, or two underscores
onecol_df <- data.frame(labels = c("a1", "b_2", "c_3_2", "d_4_1"))
# Split each label at "_" into exactly two columns; labels with more or fewer
# pieces trigger the warnings shown below.
separate(
  onecol_df,
  col = "labels",
  into = c("alphabet", "numeric"),
  sep = "_"
)
## Warning: Expected 2 pieces. Additional pieces discarded in 2 rows [3, 4].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [1].
## alphabet numeric
## 1 a1 <NA>
## 2 b 2
## 3 c 3
## 4 d 4
Type2: One column in a data frame can contain another data frame or a list. Solution: This is not wrong data; we are just making it look simpler.
# Load the Yelp subset downloaded last week directly from the course repository
yelp_subset <- read_rds("https://raw.githubusercontent.com/ujhwang/UrbanAnalytics2023/main/Lab/module_1/week2/yelp_subset.rds")
# Show it as a tibble, wide enough that every column is printed
print(tibble(yelp_subset), width = 1000)
## # A tibble: 10 × 5
## id categories transactions coordinates$latitude
## <chr> <list> <list> <dbl>
## 1 eG-UO83g_5zDk70FIJbm2w <df [1 × 2]> <chr [1]> 33.8
## 2 lmqNHL01VGjOPMnO_HJ9RQ <df [3 × 2]> <chr [2]> 33.8
## 3 _iqFvc3zToL08WZrNeFP3Q <df [3 × 2]> <chr [2]> 33.8
## 4 tDv2qG4N7PsYLN0QYuuaZQ <df [3 × 2]> <chr [2]> 33.8
## 5 3ehyrexo3WcoTy74c2jDKA <df [3 × 2]> <chr [2]> 33.8
## 6 hmrRb7qX3K705MuxHHfgNA <df [3 × 2]> <chr [2]> 33.8
## 7 Gwi9PMVb61nrrpUNa9_wfQ <df [3 × 2]> <chr [1]> 33.8
## 8 Pl2c7HLD9UgFL2zHP0cZVQ <df [1 × 2]> <chr [2]> 33.8
## 9 qS2Gg7_FH8iHFiorOVd7hg <df [3 × 2]> <chr [1]> 33.8
## 10 Z2qMwUhnGt_2pA9uQbS7Uw <df [3 × 2]> <chr [1]> 33.8
## $longitude location$address1 $address2 $address3 $city $zip_code
## <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 -84.4 1144 Crescent Ave NE "" "" Atlanta 30309
## 2 -84.4 1100 Peachtree St NE "Ste 110" "" Atlanta 30309
## 3 -84.4 1075 Peachtree St NE "" "" Atlanta 30309
## 4 -84.4 60 11th St NE "" <NA> Atlanta 30309
## 5 -84.4 1001 Piedmont Ave "" "" Atlanta 30309
## 6 -84.4 1065 Peachtree St NE "" "" Atlanta 30309
## 7 -84.4 1106 Crescent Ave NE "" "" Atlanta 30309
## 8 -84.4 77TH 12th St NE "Ste 2" "" Atlanta 30309
## 9 -84.4 1123 Peachtree Walk NE "" "" Atlanta 30309
## 10 -84.4 878 Peachtree St NE "" "" Atlanta 30309
## $country $state $display_address
## <chr> <chr> <list>
## 1 US GA <chr [2]>
## 2 US GA <chr [3]>
## 3 US GA <chr [2]>
## 4 US GA <chr [2]>
## 5 US GA <chr [2]>
## 6 US GA <chr [2]>
## 7 US GA <chr [2]>
## 8 US GA <chr [3]>
## 9 US GA <chr [2]>
## 10 US GA <chr [2]>
You need to use the following code to see the details
# First rows of the data frame nested inside the "coordinates" column
head(yelp_subset$coordinates)
## latitude longitude
## 1 33.78600 -84.38456
## 2 33.78480 -84.38354
## 3 33.78418 -84.38283
## 4 33.78354 -84.38483
## 5 33.78200 -84.38015
## 6 33.78331 -84.38356
Using ‘flatten()’ function in jsonlite package
# Flatten the nested data-frame columns into ordinary columns, then convert to a tibble
yelp_flat <- as_tibble(jsonlite::flatten(yelp_subset))
# The nested "coordinates" column is no longer present after flattening
head(yelp_flat$coordinates)
## Warning: Unknown or uninitialised column: `coordinates`.
## NULL
Each component of the list contains a character vector. The list-columns ‘transactions’ and ‘location.display_address’ are each concatenated into a single string below.
# Collapse each list-column element (a character vector) into one comma-separated string
yelp_concat <- mutate(
  yelp_flat,
  transactions = map_chr(
    transactions,
    function(parts) str_c(parts, collapse = ", ")
  ),
  location.display_address = map_chr(
    location.display_address,
    function(parts) str_c(parts, collapse = ", ")
  )
)
Type3: In each component of the list, there is a data frame with two columns and a varying number of rows. Solution: 1. Use lapply or map to loop through each component of the list-column. 2. For each component (which is a data frame), extract the “title” column. 3. For however many elements are in the extracted column (which is now a vector after the extraction), do the same thing we just did above: concatenate them all.
# Collapse one element of the Yelp "categories" list-column into a single string.
#   x: a data frame with columns "alias" and "title" (one element of Yelp$categories)
#   value: the entries of x[["title"]] joined together with ", "
concate_list <- function(x) {
  str_c(x[["title"]], collapse = ", ")
}
# Replace the "categories" list-column with the concatenated category titles
yelp_flat2 <- mutate(
  yelp_concat,
  categories = map_chr(categories, concate_list)
)
# Print wide enough that every column is shown
print(yelp_flat2, width = 1000)
## # A tibble: 10 × 13
## id categories
## <chr> <chr>
## 1 eG-UO83g_5zDk70FIJbm2w Southern
## 2 lmqNHL01VGjOPMnO_HJ9RQ Southern, Cocktail Bars, Seafood
## 3 _iqFvc3zToL08WZrNeFP3Q Steakhouses, American (New), Cocktail Bars
## 4 tDv2qG4N7PsYLN0QYuuaZQ Spanish, Gastropubs, Tapas Bars
## 5 3ehyrexo3WcoTy74c2jDKA Breakfast & Brunch, Southern, American (New)
## 6 hmrRb7qX3K705MuxHHfgNA Cafes, Bars, Desserts
## 7 Gwi9PMVb61nrrpUNa9_wfQ Seafood, Bars, American (New)
## 8 Pl2c7HLD9UgFL2zHP0cZVQ Indian
## 9 qS2Gg7_FH8iHFiorOVd7hg Italian, Wine Bars, Pasta Shops
## 10 Z2qMwUhnGt_2pA9uQbS7Uw Burgers, American (Traditional), Bars
## transactions coordinates.latitude coordinates.longitude
## <chr> <dbl> <dbl>
## 1 delivery 33.8 -84.4
## 2 delivery, pickup 33.8 -84.4
## 3 delivery, pickup 33.8 -84.4
## 4 delivery, pickup 33.8 -84.4
## 5 delivery, pickup 33.8 -84.4
## 6 delivery, pickup 33.8 -84.4
## 7 delivery 33.8 -84.4
## 8 delivery, pickup 33.8 -84.4
## 9 delivery 33.8 -84.4
## 10 delivery 33.8 -84.4
## location.address1 location.address2 location.address3 location.city
## <chr> <chr> <chr> <chr>
## 1 1144 Crescent Ave NE "" "" Atlanta
## 2 1100 Peachtree St NE "Ste 110" "" Atlanta
## 3 1075 Peachtree St NE "" "" Atlanta
## 4 60 11th St NE "" <NA> Atlanta
## 5 1001 Piedmont Ave "" "" Atlanta
## 6 1065 Peachtree St NE "" "" Atlanta
## 7 1106 Crescent Ave NE "" "" Atlanta
## 8 77TH 12th St NE "Ste 2" "" Atlanta
## 9 1123 Peachtree Walk NE "" "" Atlanta
## 10 878 Peachtree St NE "" "" Atlanta
## location.zip_code location.country location.state
## <chr> <chr> <chr>
## 1 30309 US GA
## 2 30309 US GA
## 3 30309 US GA
## 4 30309 US GA
## 5 30309 US GA
## 6 30309 US GA
## 7 30309 US GA
## 8 30309 US GA
## 9 30309 US GA
## 10 30309 US GA
## location.display_address
## <chr>
## 1 1144 Crescent Ave NE, Atlanta, GA 30309
## 2 1100 Peachtree St NE, Ste 110, Atlanta, GA 30309
## 3 1075 Peachtree St NE, Atlanta, GA 30309
## 4 60 11th St NE, Atlanta, GA 30309
## 5 1001 Piedmont Ave, Atlanta, GA 30309
## 6 1065 Peachtree St NE, Atlanta, GA 30309
## 7 1106 Crescent Ave NE, Atlanta, GA 30309
## 8 77TH 12th St NE, Ste 2, Atlanta, GA 30309
## 9 1123 Peachtree Walk NE, Atlanta, GA 30309
## 10 878 Peachtree St NE, Atlanta, GA 30309
# Re-create the toy_df used earlier (NA marks a missing result)
toy_df <- data.frame(
  name = c("John", "Jane", "Mary"),
  treatment_a = c(NA, 16, 3),
  treatment_b = c(2, 11, 1),
  treatment_c = c(6, 12, NA)
)
# Keep only the rows where treatment_a is not missing
filter(toy_df, !is.na(treatment_a))
## name treatment_a treatment_b treatment_c
## 1 Jane 16 11 12
## 2 Mary 3 1 NA
The drop_na() function will drop every row that has an NA in any one of the columns, so be careful with it to avoid deleting too much information.
# With no columns named, every column is checked and any row containing at
# least one NA is removed.
drop_na(toy_df)
## name treatment_a treatment_b treatment_c
## 1 Jane 16 11 12
There must be intersections between the circles of the various census tracts, so there will be duplicates in the Yelp data as well. Yelp data provides a unique ID column, so we can use it to identify duplicates.
# Read the full data
# NOTE(review): absolute, machine-specific path; this only runs where the file
# exists at exactly this location.
my_yelp <- read_rds('C:/Users/49765/Desktop/Urban Analytics/mini1/yelp_all.rds')
# Issue 2 ------------------------------
# Keep the first row for each Yelp business id, retaining all other columns.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
yelp_unique <- my_yelp %>%
  distinct(id, .keep_all = TRUE)
# Report how many rows were removed as duplicates
glue::glue("Before dropping duplicated rows, there were {nrow(my_yelp)} rows. After dropping them, there are {nrow(yelp_unique)} rows") %>%
  print()
## Before dropping duplicated rows, there were 166 rows. After dropping them, there are 32 rows
This is the main issue in Yelp data, caused by the original data format being JSON.
# Flatten the de-duplicated Yelp data and collapse its list-columns to strings
yelp_flat <- yelp_unique %>%
  # 1. Nested data-frame columns become ordinary columns
  jsonlite::flatten() %>%
  # 2. Each list-column element is joined into one comma-separated string
  mutate(
    transactions = map_chr(
      transactions,
      function(parts) str_c(parts, collapse = ", ")
    ),
    location.display_address = map_chr(
      location.display_address,
      function(parts) str_c(parts, collapse = ", ")
    ),
    # concate_list is the custom function that joins the category titles
    categories = map_chr(categories, concate_list)
  )
Identify whether there are any NA values. The results show that only the address fields (location.address2 and location.address3) have missing values, so no further NA processing is applied.
# Count the missing values in every column (named numeric vector, one entry per column)
map_dbl(yelp_flat, ~ sum(is.na(.x)))
## id alias name
## 0 0 0
## image_url is_closed url
## 0 0 0
## review_count categories rating
## 0 0 0
## transactions phone display_phone
## 0 0 0
## distance coordinates.latitude coordinates.longitude
## 0 0 0
## location.address1 location.address2 location.address3
## 0 1 5
## location.city location.zip_code location.country
## 0 0 0
## location.state location.display_address
## 0 0
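The sf package cannot handle NAs in coordinates, so any rows with missing coordinates must be dropped. (The NA counts above show no missing latitude or longitude values in this dataset, so the step below acts as a safeguard.)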
# First, check that latitude and longitude are missing in exactly the same rows
# (TRUE means the two NA patterns match row for row).
identical(
  is.na(yelp_flat$coordinates.latitude),
  is.na(yelp_flat$coordinates.longitude)
)
## [1] TRUE
# Remove any row whose longitude is missing.
yelp_dropna1 <- drop_na(yelp_flat, coordinates.longitude)
Delete points that fall outside of our boundary.
# Census boundary polygons, read from a local GeoJSON file
census <- st_read(
  "C:/Users/49765/Desktop/Urban Analytics/mini1/washington.geojson"
)
## Reading layer `washington' from data source
## `C:\Users\49765\Desktop\Urban Analytics\mini1\washington.geojson'
## using driver `GeoJSON'
## Simple feature collection with 206 features and 5 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -77.11976 ymin: 38.79165 xmax: -76.9094 ymax: 38.99511
## Geodetic CRS: NAD83
# Converting yelp_dropna1 into a sf object (point geometry built from the
# longitude/latitude columns, tagged as EPSG:4326)
yelp_sf <- yelp_dropna1 %>%
  st_as_sf(coords = c("coordinates.longitude", "coordinates.latitude"), crs = 4326)
# Bring the census polygons into the same CRS as the Yelp points. The layer was
# read as NAD83, so it is reprojected with st_transform(); assigning with
# `st_crs<-` would only relabel the coordinates without transforming them.
census <- st_transform(census, 4326)
# sf subsetting: keep only the Yelp points that intersect the union of the census polygons
yelp_in <- yelp_sf[st_union(filter(census)), , op = st_intersects]
# Compare the row/column counts of the raw data with the cleaned, clipped data
print(glue::glue("nrow before: {nrow(my_yelp)} -> nrow after: {nrow(yelp_in)} \n
ncol before: {ncol(my_yelp)} -> ncol after: {ncol(yelp_in)} \n"))
## nrow before: 166 -> nrow after: 27
##
## ncol before: 15 -> ncol after: 22
# Visualize the retained points on an interactive map, colored by rating
tmap_mode(mode = "view")
## tmap mode set to interactive viewing
tm_shape(yelp_in) +
  tm_dots(col = "rating")
# Ensure the census layer is an sf object before joining.
# NOTE(review): census was read with st_read(), whose output above already
# reports a simple feature collection; confirm whether this conversion is needed.
census_sf <- st_sf(filter(census))
# Spatial joins in both directions: polygons with their points, and points with their polygon
census_yelp <- census_sf %>% st_join(yelp_in, join = st_intersects)
yelp_census <- yelp_in %>% st_join(census_sf, join = st_intersects)
# Number of rows produced by each join order
cat("census_yelp: ", nrow(census_yelp))
## census_yelp: 215
cat("yelp_census: ", nrow(yelp_census))
## yelp_census: 27
# View
# First rows of the polygon-side join result
head(census_yelp)
## Simple feature collection with 6 features and 26 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -77.08563 ymin: 38.87646 xmax: -77.009 ymax: 38.94584
## Geodetic CRS: WGS 84
## GEOID hhincome race.tot race.white race.black id
## 1 11001001002 140772 3489 2710 211 _RaDtIOFuyYCUHLKe59YfA
## 2 11001002801 62323 4104 1727 1104 <NA>
## 3 11001003100 133408 3825 1820 987 <NA>
## 4 11001004001 153650 4785 4126 309 <NA>
## 5 11001010500 92083 3927 2283 1164 <NA>
## 6 11001004600 129896 3169 1517 1384 <NA>
## alias name
## 1 medstar-georgetown-pediatrics-washington-2 MedStar Georgetown Pediatrics
## 2 <NA> <NA>
## 3 <NA> <NA>
## 4 <NA> <NA>
## 5 <NA> <NA>
## 6 <NA> <NA>
## image_url
## 1 https://s3-media3.fl.yelpcdn.com/bphoto/LELlyZqWQAMVqEd6JOF57Q/o.jpg
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## is_closed
## 1 FALSE
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
## url
## 1 https://www.yelp.com/biz/medstar-georgetown-pediatrics-washington-2?adjust_creative=SY58PgLFZDsd-jyTBEFDIQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=SY58PgLFZDsd-jyTBEFDIQ
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## review_count categories rating transactions phone
## 1 20 Hospitals, Pediatricians 1.5 +12022433400
## 2 NA <NA> NA <NA> <NA>
## 3 NA <NA> NA <NA> <NA>
## 4 NA <NA> NA <NA> <NA>
## 5 NA <NA> NA <NA> <NA>
## 6 NA <NA> NA <NA> <NA>
## display_phone distance location.address1 location.address2
## 1 (202) 243-3400 448.3399 4200 Wisconsin Ave NW Fl 4
## 2 <NA> NA <NA> <NA>
## 3 <NA> NA <NA> <NA>
## 4 <NA> NA <NA> <NA>
## 5 <NA> NA <NA> <NA>
## 6 <NA> NA <NA> <NA>
## location.address3 location.city location.zip_code location.country
## 1 Washington, DC 20016 US
## 2 <NA> <NA> <NA> <NA>
## 3 <NA> <NA> <NA> <NA>
## 4 <NA> <NA> <NA> <NA>
## 5 <NA> <NA> <NA> <NA>
## 6 <NA> <NA> <NA> <NA>
## location.state location.display_address
## 1 DC 4200 Wisconsin Ave NW, Fl 4, Washington, DC 20016
## 2 <NA> <NA>
## 3 <NA> <NA>
## 4 <NA> <NA>
## 5 <NA> <NA>
## 6 <NA> <NA>
## geometry
## 1 POLYGON ((-77.08563 38.9382...
## 2 POLYGON ((-77.03645 38.9349...
## 3 POLYGON ((-77.02826 38.9318...
## 4 POLYGON ((-77.05018 38.9212...
## 5 POLYGON ((-77.01756 38.8850...
## 6 POLYGON ((-77.01811 38.9146...
# First rows of the point-side join result
head(yelp_census)
## Simple feature collection with 6 features and 26 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -77.08518 ymin: 38.89949 xmax: -77.00397 ymax: 38.94356
## Geodetic CRS: WGS 84
## id
## 1 _RaDtIOFuyYCUHLKe59YfA
## 2 YmDNjmiYY2iWTZD-NkD1gQ
## 3 AVKphCn6rkDKRrnaBQnGGg
## 4 UGEDcMyuV2aH52RKfMIWeg
## 5 KmkgdnIOuKzw6DZ-j0pJxQ
## 6 AQyKAawtsnSkuCYidRs7tw
## alias
## 1 medstar-georgetown-pediatrics-washington-2
## 2 georgetown-university-hospital-washington-2
## 3 georgetown-university-hospital-washington-4
## 4 doctors-groover-christie-and-merritt-washington
## 5 kaiser-health-plan-of-the-mid-atlantic-states-washington
## 6 columbia-hospital-for-women-medical-center-washington
## name
## 1 MedStar Georgetown Pediatrics
## 2 Georgetown University Hospital
## 3 Georgetown University Hospital
## 4 Doctors Groover Christie & Merritt
## 5 Kaiser Health Plan of the MID-Atlantic States
## 6 Columbia Hospital For Women Medical Center
## image_url
## 1 https://s3-media3.fl.yelpcdn.com/bphoto/LELlyZqWQAMVqEd6JOF57Q/o.jpg
## 2
## 3
## 4
## 5 https://s3-media3.fl.yelpcdn.com/bphoto/_WEVQTve5o4VXw6Mb6Ii0Q/o.jpg
## 6
## is_closed
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## url
## 1 https://www.yelp.com/biz/medstar-georgetown-pediatrics-washington-2?adjust_creative=SY58PgLFZDsd-jyTBEFDIQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=SY58PgLFZDsd-jyTBEFDIQ
## 2 https://www.yelp.com/biz/georgetown-university-hospital-washington-2?adjust_creative=SY58PgLFZDsd-jyTBEFDIQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=SY58PgLFZDsd-jyTBEFDIQ
## 3 https://www.yelp.com/biz/georgetown-university-hospital-washington-4?adjust_creative=SY58PgLFZDsd-jyTBEFDIQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=SY58PgLFZDsd-jyTBEFDIQ
## 4 https://www.yelp.com/biz/doctors-groover-christie-and-merritt-washington?adjust_creative=SY58PgLFZDsd-jyTBEFDIQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=SY58PgLFZDsd-jyTBEFDIQ
## 5 https://www.yelp.com/biz/kaiser-health-plan-of-the-mid-atlantic-states-washington?adjust_creative=SY58PgLFZDsd-jyTBEFDIQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=SY58PgLFZDsd-jyTBEFDIQ
## 6 https://www.yelp.com/biz/columbia-hospital-for-women-medical-center-washington?adjust_creative=SY58PgLFZDsd-jyTBEFDIQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=SY58PgLFZDsd-jyTBEFDIQ
## review_count categories rating transactions phone
## 1 20 Hospitals, Pediatricians 1.5 +12022433400
## 2 1 Doctors, Hospitals 3.0 +12026861316
## 3 1 Hospitals 4.0 +12024442000
## 4 1 Hospitals 2.0 +12025374781
## 5 1 Hospitals 4.0 +12023463550
## 6 3 Hospitals 5.0 +12026590240
## display_phone distance location.address1 location.address2
## 1 (202) 243-3400 448.3399 4200 Wisconsin Ave NW Fl 4
## 2 (202) 686-1316 845.8368 3301 New Mexico Ave NW
## 3 (202) 444-2000 524.4550
## 4 (202) 537-4781 524.4550
## 5 (202) 346-3550 1708.9989 700 Second St NE
## 6 (202) 659-0240 439.7342 2440 M St NW Ste 224
## location.address3 location.city location.zip_code location.country
## 1 Washington, DC 20016 US
## 2 Washington, DC 20016 US
## 3 Washington, DC 20001 US
## 4 Washington, DC 20001 US
## 5 Fl 6- Nephrology Washington, DC 20002 US
## 6 Washington, DC 20037 US
## location.state location.display_address
## 1 DC 4200 Wisconsin Ave NW, Fl 4, Washington, DC 20016
## 2 DC 3301 New Mexico Ave NW, Washington, DC 20016
## 3 DC Washington, DC 20001
## 4 DC Washington, DC 20001
## 5 DC 700 Second St NE, Fl 6- Nephrology, Washington, DC 20002
## 6 DC 2440 M St NW, Ste 224, Washington, DC 20037
## GEOID hhincome race.tot race.white race.black
## 1 11001001002 140772 3489 2710 211
## 2 11001000803 121417 2523 1800 160
## 3 11001004801 109453 3138 1732 1116
## 4 11001004802 118500 3447 1325 1213
## 5 11001010603 117945 1864 1501 303
## 6 11001005503 173255 2256 1895 76
## geometry
## 1 POINT (-77.0778 38.94356)
## 2 POINT (-77.08518 38.93365)
## 3 POINT (-77.01809 38.91035)
## 4 POINT (-77.01755 38.90645)
## 5 POINT (-77.00397 38.89949)
## 6 POINT (-77.05239 38.90417)
# Map 1: mean Yelp rating per census tract (GEOID), shaded by quantile class
census_yelp %>%
  group_by(GEOID) %>%
  summarise(rating = mean(rating)) %>%
  tm_shape() +
  tm_polygons(col = "rating", style = "quantile")
# Map 2: Yelp points colored by the household income of the tract they fall in
tm_shape(yelp_census) +
  tm_dots(col = "hhincome")
# Recode review_count into a two-level label. mutate() is used because the
# recoded values are stored in a new variable.
yelp_in %>%
  mutate(
    review_count_binary = case_when(
      review_count > 1000 ~ "many",
      review_count <= 1000 ~ "few"
    )
  ) %>%
  # Keep just the original and recoded columns so the printout stays short
  select(review_count, review_count_binary) %>%
  head()
## Simple feature collection with 6 features and 2 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -77.08518 ymin: 38.89949 xmax: -77.00397 ymax: 38.94356
## Geodetic CRS: WGS 84
## review_count review_count_binary geometry
## 1 20 few POINT (-77.0778 38.94356)
## 2 1 few POINT (-77.08518 38.93365)
## 3 1 few POINT (-77.01809 38.91035)
## 4 1 few POINT (-77.01755 38.90645)
## 5 1 few POINT (-77.00397 38.89949)
## 6 3 few POINT (-77.05239 38.90417)
# Standardize every numeric column with scale() and then select those columns.
# The predicate is wrapped in where() because passing a bare predicate such as
# is.numeric to across()/select() has been deprecated since tidyselect 1.1.0
# and raises a warning.
yelp_in %>%
  mutate(across(where(is.numeric), scale)) %>%
  select(where(is.numeric))
## Simple feature collection with 27 features and 3 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -77.10924 ymin: 38.84792 xmax: -76.93404 ymax: 38.96691
## Geodetic CRS: WGS 84
## First 10 features:
## review_count rating distance geometry
## 1 -0.3411455 -1.28389670 -0.53237460 POINT (-77.0778 38.94356)
## 2 -0.5540502 -0.04585345 0.05530671 POINT (-77.08518 38.93365)
## 3 -0.5540502 0.77950871 -0.41984178 POINT (-77.01809 38.91035)
## 4 -0.5540502 -0.87121562 -0.41984178 POINT (-77.01755 38.90645)
## 5 -0.5540502 0.77950871 1.33145324 POINT (-77.00397 38.89949)
## 6 -0.5316391 1.60487088 -0.54509770 POINT (-77.05239 38.90417)
## 7 -0.4307896 -0.45853454 -0.76419672 POINT (-76.99583 38.89308)
## 8 -0.5540502 1.60487088 -0.78577330 POINT (-76.99536 38.89233)
## 9 -0.5540502 1.60487088 -0.61267556 POINT (-77.02704 38.96691)
## 10 0.7569944 -1.28389670 -0.28692399 POINT (-77.02083 38.91774)
# Frequency of each rating value, using base table()
yelp_in$rating %>%
  table()
## .
## 1 1.5 2 2.5 3 3.5 4 5
## 1 2 5 5 4 2 3 5
# Frequency of each rating value, using dplyr count() on the sf object
count(yelp_in, rating)
## Simple feature collection with 8 features and 2 fields
## Geometry type: GEOMETRY
## Dimension: XY
## Bounding box: xmin: -77.10924 ymin: 38.84792 xmax: -76.93404 ymax: 38.96691
## Geodetic CRS: WGS 84
## rating n geometry
## 1 1.0 1 POINT (-77.07057 38.91999)
## 2 1.5 2 MULTIPOINT ((-77.02083 38.9...
## 3 2.0 5 MULTIPOINT ((-77.05048 38.9...
## 4 2.5 5 MULTIPOINT ((-77.07562 38.9...
## 5 3.0 4 MULTIPOINT ((-77.04934 38.8...
## 6 3.5 2 MULTIPOINT ((-77.04538 38.9...
## 7 4.0 3 MULTIPOINT ((-77.07533 38.9...
## 8 5.0 5 MULTIPOINT ((-77.05239 38.9...
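‘nrow before: 166 -> nrow after: 27’, so we know the data have 27 rows after cleaning. ‘ncol before: 15 -> ncol after: 22’: the nested coordinates and location columns are flattened into separate columns, and the two coordinate columns are then replaced by a single geometry column, for a net gain of seven columns.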
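The most frequent rating returned by getmode() is 2. Note that the frequency table above shows ratings 2, 2.5 and 5 tied at five businesses each; getmode() reports only one of the tied values.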
# Return the most frequent value of v.
#   v: a vector
#   value: the element of unique(v) with the highest count; when several values
#          tie, the one that appears first in v is returned.
getmode <- function(v) {
  distinct_values <- unique(v)
  # Position of each element of v within distinct_values, then a count per position
  counts <- tabulate(match(v, distinct_values))
  distinct_values[which.max(counts)]
}
# Most frequent rating among the retained Yelp businesses
result <- getmode(yelp_in[["rating"]])
print(result)
## [1] 2
The closer the rating is to 4, the higher the review_count tends to be.
# Scatter plot of review count against rating. The title and axis labels
# describe the plotted variables (the template placeholders were replaced), and
# the dangling trailing comma in the call was removed.
plot(
  yelp_in$rating, yelp_in$review_count,
  main = "Review count by rating",
  xlab = "Rating",
  ylab = "Review count"
)
Different join orders give different numbers of rows in the results, because st_join keeps every row of its first argument. census_yelp: 215 yelp_census: 27
Judging visually from the map, high-rated hospitals and low-rated hospitals each tend to cluster together. In other words, hospitals located near low-rated hospitals are more likely to be low-rated as well.