library(plyr) #loading plyr first then dplyr as there are conflicts in execution if done otherwise
library(tidyverse)
library(knitr)
library(jsonlite)
# Build the Socrata request URL (resource fhrw-4uyv) and download the records.
# The app token is taken from the NYC_OPENDATA_APP_TOKEN environment variable
# when that variable is set and non-empty, so the credential can be rotated
# without editing this file; the literal below is only the fallback.
# NOTE(review): the fallback token is committed in plain text -- consider
# revoking it and relying on the environment variable alone.
api_tkn <- local({
  token <- Sys.getenv("NYC_OPENDATA_APP_TOKEN")
  if (!nzchar(token)) {
    token <- "ZHxqxXyUSoZlBvzxpZsTT9QjG"
  }
  paste0("$$app_token=", token)
})
api_endpoint <- "https://data.cityofnewyork.us/resource/fhrw-4uyv.json?"
api_limit <- "&$limit=150000"
#api_filter <- "&borough=BRONX"
# fromJSON() fetches the URL and parses the JSON response.
request_json <- fromJSON(paste0(api_endpoint, api_tkn, api_limit))
# Check what kind of object the parsed response is, then list its columns.
class(x = request_json)
## [1] "data.frame"
names(request_json)
## [1] "address_type" "agency"
## [3] "agency_name" "bbl"
## [5] "borough" "city"
## [7] "closed_date" "community_board"
## [9] "complaint_type" "created_date"
## [11] "cross_street_1" "cross_street_2"
## [13] "descriptor" "facility_type"
## [15] "incident_address" "incident_zip"
## [17] "latitude" "location"
## [19] "location_type" "longitude"
## [21] "open_data_channel_type" "park_borough"
## [23] "park_facility_name" "resolution_action_updated_date"
## [25] "resolution_description" "status"
## [27] "street_name" "unique_key"
## [29] "x_coordinate_state_plane" "y_coordinate_state_plane"
## [31] "due_date" "intersection_street_1"
## [33] "intersection_street_2" "taxi_pick_up_location"
## [35] "bridge_highway_direction" "bridge_highway_name"
## [37] "bridge_highway_segment" "road_ramp"
## [39] "taxi_company_borough" "landmark"
## [41] "vehicle_type"
# Number of records returned, followed by the first record as a sample.
dim(request_json)[1L]
## [1] 150000
head(request_json, n = 1)
## address_type agency agency_name
## 1 ADDRESS HPD Department of Housing Preservation and Development
## bbl borough city closed_date community_board
## 1 3050560064 BROOKLYN BROOKLYN 2011-11-25T00:00:00.000 09 BROOKLYN
## complaint_type created_date cross_street_1 cross_street_2
## 1 NONCONST 2011-11-21T00:00:00.000 BEDFORD AVENUE ROGERS AVENUE
## descriptor facility_type incident_address incident_zip
## 1 VERMIN N/A 181 CLARKSON AVENUE 11226
## latitude location.type location.coordinates
## 1 40.65519246467765 Point -73.95402, 40.65519
## location_type longitude open_data_channel_type
## 1 RESIDENTIAL BUILDING -73.95402339048265 UNKNOWN
## park_borough park_facility_name resolution_action_updated_date
## 1 BROOKLYN Unspecified 2011-11-25T00:00:00.000
## resolution_description
## 1 The Department of Housing Preservation and Development inspected the following conditions. Violations were issued. Information about specific violations is available at www.nyc.gov/hpd.
## status street_name unique_key x_coordinate_state_plane
## 1 Closed CLARKSON AVENUE 21931039 997007
## y_coordinate_state_plane due_date intersection_street_1
## 1 177984 <NA> <NA>
## intersection_street_2 taxi_pick_up_location bridge_highway_direction
## 1 <NA> <NA> <NA>
## bridge_highway_name bridge_highway_segment road_ramp
## 1 <NA> <NA> <NA>
## taxi_company_borough landmark vehicle_type
## 1 <NA> <NA> <NA>
#library(tidyverse)
dataset <- request_json
# Bar chart of the number of service requests for the 50 most frequent
# complaint types.
# - head(..., 50) is used instead of [1:50, ] so that no NA rows are produced
#   when fewer than 50 complaint types exist.
# - dplyr::count() is called explicitly because plyr, which is also attached,
#   exports its own count().
# - geom_bar() counts rows per category by default; the earlier
#   geom_histogram(stat = "count") triggered the warning
#   "Ignoring unknown parameters: binwidth, bins, pad".
ggplot(
  subset(
    dataset,
    complaint_type %in%
      head(dplyr::count(dataset, complaint_type, sort = TRUE), 50)$complaint_type
  ),
  aes(complaint_type)
) +
  geom_bar() +
  labs(x = "Complaint Type", y = "Service Requests") +
  coord_flip() +
  theme_bw()
# Keep only the rows whose complaint type is among the 50 most frequent ones,
# then report how many rows remain.
dataset_qckfilt <- dataset[dataset$complaint_type %in%
  count(dataset, complaint_type, sort = TRUE)$complaint_type[1:50], , drop = FALSE]
nrow(dataset_qckfilt)
## [1] 142079
# Reduce the filtered data to the three columns the faceted plot uses.
dataset_qckfilt <- select(dataset_qckfilt, complaint_type, borough, status)
# One panel per borough; geom_count() sizes each status / complaint-type point
# by the number of rows that share that position.
ggplot(data = dataset_qckfilt, mapping = aes(x = status, y = complaint_type)) +
  geom_point() +
  geom_count() +
  facet_wrap(~borough)
library(tidytext)
data(stop_words)
# Count how often each word of resolution_description occurs per borough:
# rows whose borough contains "Unspecified" are dropped, the text is split
# into one word per row, and words listed in stop_words are removed.
# The join key is stated explicitly so the join does not depend on whichever
# columns the two tables happen to share.
# tally() after group_by(borough, word) leaves the result grouped by borough.
tokenized_resolutions <- dataset %>%
  select(
    complaint_type, descriptor, street_name, city, due_date,
    resolution_description, borough, open_data_channel_type
  ) %>%
  filter(!str_detect(borough, "Unspecified")) %>%
  unnest_tokens(word, resolution_description) %>%
  anti_join(stop_words, by = "word") %>%
  group_by(borough, word) %>%
  tally()
tokenized_resolutions %>% glimpse()
## Observations: 3,227
## Variables: 3
## $ borough <chr> "BRONX", "BRONX", "BRONX", "BRONX", "BRONX", "BRONX", ...
## $ word <chr> "1", "10", "120", "14", "152", "19", "21", "212", "287...
## $ n <int> 16, 33, 28, 72, 24, 24, 56, 853, 9, 24, 8, 367, 1770, ...
# Horizontal bars of the 25 highest word counts within each borough, one
# free-scaled panel per borough; the fill legend is hidden because the panels
# already identify the borough.
tokenized_resolutions %>%
  group_by(borough) %>%
  top_n(25) %>%
  arrange(desc(n)) %>%
  ggplot(mapping = aes(x = reorder(word, n), y = n, fill = factor(borough))) +
  geom_col() +
  facet_wrap(~borough, scales = "free") +
  coord_flip() +
  labs(
    x = "Words",
    y = "Frequency",
    title = "Top words used in NYC311 Service Requests by Borough",
    subtitle = ""
  ) +
  theme(legend.position = "none")
# Treat each borough as a document, attach tf, idf and tf_idf to every
# borough/word count, and list the rows from highest to lowest tf_idf.
tf_idf_words <- arrange(
  bind_tf_idf(tokenized_resolutions, word, borough, n),
  desc(tf_idf)
)
tf_idf_words
## # A tibble: 3,227 x 6
## # Groups: borough [5]
## borough word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 STATEN ISLAND pruning 42 0.000640 0.511 0.000327
## 2 MANHATTAN attend 420 0.000966 0.223 0.000216
## 3 QUEENS spring 42 0.000119 1.61 0.000191
## 4 STATEN ISLAND trees 22 0.000335 0.511 0.000171
## 5 STATEN ISLAND cycle 21 0.000320 0.511 0.000163
## 6 STATEN ISLAND prunes 21 0.000320 0.511 0.000163
## 7 STATEN ISLAND reviews 21 0.000320 0.511 0.000163
## 8 BRONX fi 269 0.000585 0.223 0.000131
## 9 STATEN ISLAND intensity 9 0.000137 0.916 0.000126
## 10 STATEN ISLAND rain 9 0.000137 0.916 0.000126
## # ... with 3,217 more rows
# Drop tokens that are not informative for comparing boroughs: anything
# containing a digit or punctuation, and the borough names themselves.
# Each row holds a single word, so the pattern "staten island" could never
# match; "staten" and "island" are therefore listed as separate alternatives.
tf_idf_words_cln <- tf_idf_words %>%
  filter(
    !str_detect(word, "[[:digit:]]+"),
    !str_detect(word, "[[:punct:]]+"),
    !str_detect(word, "bronx|brooklyn|manhattan|queens|staten|island")
  )
# top_n() is given no weighting column, so it ranks by the last column
# (tf_idf). The data are plotted as horizontal bars, one free-scaled panel per
# borough, with the fill legend hidden.
tf_idf_words_cln %>%
  top_n(25) %>%
  arrange(desc(tf_idf)) %>%
  ggplot(aes(x = reorder(word, tf_idf), y = tf_idf, fill = borough)) +
  geom_col() +
  labs(
    x = "Words", y = "tf-idf",
    title = "Distinctive words used in NYC311 Service Requests by Borough",
    subtitle = ""
  ) +
  coord_flip() +
  theme(legend.position = "none") +
  facet_wrap(~ borough, scales = "free")
## Selecting by tf_idf
# Point data for the map: rows of the 50 most frequent complaint types,
# reduced to type, borough and coordinates, with incomplete rows removed.
dataset_map <- dataset[dataset$complaint_type %in%
  count(dataset, complaint_type, sort = TRUE)$complaint_type[1:50], , drop = FALSE]
dataset_map <- drop_na(
  select(dataset_map, complaint_type, borough, latitude, longitude)
)
#library(plyr)
# NOTE(review): "count" is presumably resolved to plyr::count here, tallying
# identical rows within each complaint type into a `freq` column -- confirm.
counts <- ddply(dataset_map, .(complaint_type), "count")
# Keep only the combinations that occur more than 80 times.
counts_filtered <- filter(counts, freq > 80)
# The plotting code needs numeric values for the size and position aesthetics.
counts_filtered[c("freq", "longitude", "latitude")] <-
  lapply(counts_filtered[c("freq", "longitude", "latitude")], as.numeric)
#install.packages("rworldmap")
#install.packages("rworldxtra")
library(rworldmap)
library(rworldxtra)
# Base layer: map polygons cropped to the longitude/latitude window below.
newmap <- getMap(resolution = "high")
#nyc_coorflimits <- data.frame( long = c(-74.5, -73.5), lat = c(40.5, 41), stringsAsFactors = FALSE)
nyc <- ggplot() +
  geom_polygon(
    data = newmap,
    mapping = aes(x = long, y = lat, group = group),
    fill = "gray",
    colour = "blue"
  ) +
  xlim(-74.5, -73.5) +
  ylim(40.5, 41)
# Overlay one red point per row of counts_filtered, sized by its frequency.
nyc_SRs <- nyc +
  geom_point(
    data = counts_filtered,
    mapping = aes(x = longitude, y = latitude, size = freq),
    colour = "red"
  ) +
  # facet_wrap(~complaint_type, scales = "free") + (Working on the console, not working on RMarkdown)
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "Highest Number of SRs by Complaint Type"
  ) +
  scale_size(name = "# of SRs")
nyc_SRs
library(png)
# Read the saved map image and draw it across the whole of a fresh, empty plot.
pp <- readPNG(source = "MapSRsbyComplType.png")
plot.new()
rasterImage(pp, xleft = 0, ybottom = 0, xright = 1, ytop = 1)
library(tm)
library(wordcloud)
# Resolution texts of the requests belonging to the 50 most frequent
# complaint types.
dataset_filt <- subset(
  dataset,
  complaint_type %in% count(dataset, complaint_type, sort = TRUE)[1:50, ]$complaint_type
)
sr_resolution <- dataset_filt$resolution_description
# NOTE(review): with its default `sub`, iconv() yields NA for any text it
# cannot convert to ASCII -- confirm that is acceptable here.
sr_resolution_cln <- sr_resolution %>% iconv("latin1", "ASCII")
# `minDocFreq` is not among the control options documented for
# TermDocumentMatrix()/termFreq(), so it had no effect; the documented way to
# require a term to occur in at least 5 documents is bounds$global.
# Any term count printed from a run made before this change reflects the
# unfiltered matrix.
control <- list(
  stopwords = TRUE,
  removePunctuation = TRUE,
  removeNumbers = TRUE,
  bounds = list(global = c(5, Inf))
) # stemming=TRUE does not provide much value
sr_corpus <- VCorpus(VectorSource(sr_resolution_cln))
sr_tdm <- TermDocumentMatrix(sr_corpus, control)
sr_tdm
## <<TermDocumentMatrix (terms: 786, documents: 142079)>>
## Non-/sparse entries: 1820508/109853586
## Sparsity : 98%
## Maximal term length: 21
## Weighting : term frequency (tf)
# Discard terms whose sparsity exceeds the 0.8 threshold, then show a summary
# of the reduced matrix.
sr_tdm_unsprsd <- removeSparseTerms(x = sr_tdm, sparse = 0.8)
sr_tdm_unsprsd
## <<TermDocumentMatrix (terms: 18, documents: 142079)>>
## Non-/sparse entries: 961420/1596002
## Sparsity : 62%
## Maximal term length: 12
## Weighting : term frequency (tf)
# Terms occurring at least 50 times in the reduced matrix: how many there
# are, followed by the terms themselves.
sr_topterms <- findFreqTerms(sr_tdm_unsprsd, lowfreq = 50)
length(sr_topterms)
## [1] 18
sr_topterms
## [1] "closed" "complaint" "condition" "conditions"
## [5] "department" "development" "exists" "file"
## [9] "following" "housing" "information" "inspected"
## [13] "issued" "new" "please" "preservation"
## [17] "still" "violations"
# Remove missing entries, then for the first five terms find the other terms
# correlated with them at 0.4 or above and render one table per term.
sr_topterms <- Filter(Negate(is.na), sr_topterms)
sr_assocs <- findAssocs(sr_tdm_unsprsd, terms = sr_topterms[seq_len(5)], corlimit = 0.4)
lapply(sr_assocs, kable)
## $closed
##
##
## x
## ------------- -----
## development 0.74
## housing 0.74
## preservation 0.74
## following 0.70
## still 0.57
## exists 0.53
## file 0.43
##
## $complaint
##
##
## x
## ------------- -----
## still 0.72
## exists 0.64
## please 0.53
## file 0.48
## development 0.43
## housing 0.43
## preservation 0.43
##
## $condition
##
##
## x
## ------- -----
## still 0.71
## exists 0.68
## file 0.54
## please 0.42
##
## $conditions
##
##
## x
## ------------- -----
## following 0.96
## development 0.82
## housing 0.82
## preservation 0.82
## violations 0.57
## issued 0.52
## inspected 0.43
##
## $department
##
##
## | x|
## |--:|
library(wordcloud)
# Total frequency of each retained term across all documents, largest first,
# drawn as a word cloud of at most 50 words.
sr_tdm_cloud <- as.matrix(sr_tdm_unsprsd)
v <- sort(rowSums(sr_tdm_cloud), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
wordcloud(
  words = d$word,
  freq = d$freq,
  min.freq = 10,
  max.words = 50,
  colors = brewer.pal(8, "Dark2")
)
library(rtweet)
##appname <- "nyc311sentiment_analysis"
## key <- "12345678901234567890"
##secret <- "12345678901234567890abcdefghijk"
# create token named "twitter_token"
##twitter_token <- rtweet::create_token(app = appname,
## consumer_key = key,
## consumer_secret = secret)
##home_directory <- "C:/DATA/HHP/Personal/Degrees/Ms. Data Science (CUNY)/R Working Dir"
##file_name <- file.path(home_directory,
## "twitter_token.rds")
## save token to home directory
##saveRDS(twitter_token, file = file_name)
## create and save environment variable
##cat(paste0("TWITTER_PAT=", file_name),
## file = file.path(home_directory, ".Renviron"),
## append = TRUE)
# Request up to 1100 tweets matching "nyc311", leaving out retweets, and show
# the text of the first five as a table.
nyc311_tweets <- search_tweets(q = "nyc311", n = 1100, include_rts = FALSE)
kable(head(nyc311_tweets$text, n = 5))
x |
---|
@KezNat @NYPD94Pct @NYCMayorsOffice I understand your concern. You may file an Illegal Parking Complaint here: https://t.co/xHlOQoqZwu or kindly DM me and I will gladly file it on your behalf. |
@alf300 @NYC_DOT @nycgov @NYCMayor @NYCMayorsOffice @DanRosenthalNYC @RoryLancman Sorry for the disturbance. You may file an After-Hours Construction Noise Complaint here: https://t.co/2KULvb99GG or kindly DM me to file it on your behalf. |
.@NYCDHS Codigo azul esta en efecto hasta mañana, viernes, 7 de diciembre a las 8:00 AM. Si vez a algien sin hogar durante esta temperatura baja, por favor llámanos al 311 https://t.co/kYFdEOazeH |
@PastorKebreau Thank you for bringing this to our attention. Kindly DM me and I will gladly assist you further. |
@GabrielleDeP20 I understand your concern. For privately owned buildings, you may file a Smoking Complaint here: https://t.co/Pb21Is93Og or DM me to file it for you. For NYCHA building, you must reach out to NYCHA Customer Contact Center at 718-707-7771 https://t.co/3AllJpYxhR. |
# Distinct user records attached to the collected tweets.
nyc311_users <- users_data(nyc311_tweets) %>% unique()
# Preview the first rows of four descriptive columns. The columns are picked
# by name rather than by position so the table does not silently change if
# the column order of the user data changes; head() is applied once, since a
# second head() on six rows is a no-op.
kable(head(nyc311_users[, c("name", "location", "followers_count", "friends_count")]))
name | location | followers_count | friends_count |
---|---|---|---|
New York City 311 | New York City | 323035 | 234 |
OLC-11249 | New York City | 84 | 308 |
Pastor Adlerette Kebreau | usa | 248 | 746 |
Gabrielle DePalo | Manhattan, NY | 21 | 26 |
Chevi Friedman | | 74 | 166 |
SamBen | | 5 | 24 |
# Tweet counts aggregated into 24-hour intervals, drawn as a blue line.
ts_plot(nyc311_tweets, by = "24 hours", col = "blue") +
  theme_minimal() +
  theme(plot.title = ggplot2::element_text(face = "bold")) +
  labs(
    x = "Date",
    y = "# of Tweets",
    title = "NYC311 Tweets in the last 7 days"
  )
#devtools::install_github("mjockers/syuzhet")
library(syuzhet)
# Score every tweet text with the NRC lexicon and place the scores next to
# the text they belong to; the first three rows are shown as a table.
nyc311_tweets_txt <- as.vector(nyc311_tweets$text)
emotion_df <- get_nrc_sentiment(char_v = nyc311_tweets_txt)
twt_emotion_df <- data.frame(nyc311_tweets_txt, emotion_df, check.names = FALSE)
kable(head(twt_emotion_df, n = 3))
nyc311_tweets_txt | anger | anticipation | disgust | fear | joy | sadness | surprise | trust | negative | positive |
---|---|---|---|---|---|---|---|---|---|---|
@KezNat @NYPD94Pct @NYCMayorsOffice I understand your concern. You may file an Illegal Parking Complaint here: https://t.co/xHlOQoqZwu or kindly DM me and I will gladly file it on your behalf. | 2 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 2 | 0 |
@alf300 @NYC_DOT @nycgov @NYCMayor @NYCMayorsOffice @DanRosenthalNYC @RoryLancman Sorry for the disturbance. You may file an After-Hours Construction Noise Complaint here: https://t.co/2KULvb99GG or kindly DM me to file it on your behalf. | 2 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 3 | 0 |
.@NYCDHS Codigo azul esta en efecto hasta mañana, viernes, 7 de diciembre a las 8:00 AM. Si vez a algien sin hogar durante esta temperatura baja, por favor llámanos al 311 https://t.co/kYFdEOazeH | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
# One sentiment score per tweet; scores above zero mark a tweet as positive.
sent.value <- get_sentiment(char_v = nyc311_tweets_txt)
positive.tweets <- nyc311_tweets_txt[0 < sent.value]
kable(head(positive.tweets, n = 5))
x |
---|
@PastorKebreau Thank you for bringing this to our attention. Kindly DM me and I will gladly assist you further. |
@jpicaro74 @NYPD121Pct Good morning and thanks for the pictures. If this is dangerous, call 911. If this is not dangerous, you can report an abandoned vehicle without license plates here: https://t.co/wJuf3Ldc5Y or DM me if you need assistance with filing. https://t.co/hDTCu9JZPB |
@NYC_DOT Thank you for looping us in. Hi, @Mason_Transport you may report illegal parking in progress here: https://t.co/Xyto97vssS or you can DM me for assistance with filing. https://t.co/hDTCu9JZPB |
@NYC_DOT Thank you for looping us in. Hi, @arnie09527558 @NYCMayor @BilldeBlasio @nycgov @NYCMayorsOffice @NotifyNYC @NYCHousing you can report visible dense smoke in the air from a rooftop chimney here: https://t.co/dZXcJB864h or DM me & I’ll file. https://t.co/hDTCu9JZPB |
@GNAnegativeaxis @NYCHRA @NYGovCuomo @NYCMayor Good evening, if you feel you have been wronged. You may file an Agency Feedback complaint to the Commissioner for Human Resources Administration here: https://t.co/SNA3Y7aapp or DM me for assistance filing. |
# Tweet(s) whose score equals the highest sentiment score.
most.positive <- nyc311_tweets_txt[max(sent.value) == sent.value]
most.positive
## [1] "Find the perfect holiday event with the #311Weekend! \n\nTake a photo with Santa Claus at @NYCParks Holiday Blvd Park, light a tree & drink cocoa at Holiday on the Hudson, and listen to your favorite tuba (yes tuba!) Christmas classics at @rockcenternyc right under the big tree. https://t.co/Gh594LZKpL"
# Tweets with a score below zero, previewing the first five.
negative.tweets <- nyc311_tweets_txt[0 > sent.value]
kable(head(negative.tweets, n = 5))
x |
---|
@KezNat @NYPD94Pct @NYCMayorsOffice I understand your concern. You may file an Illegal Parking Complaint here: https://t.co/xHlOQoqZwu or kindly DM me and I will gladly file it on your behalf. |
@alf300 @NYC_DOT @nycgov @NYCMayor @NYCMayorsOffice @DanRosenthalNYC @RoryLancman Sorry for the disturbance. You may file an After-Hours Construction Noise Complaint here: https://t.co/2KULvb99GG or kindly DM me to file it on your behalf. |
.@NYCDHS Codigo azul esta en efecto hasta mañana, viernes, 7 de diciembre a las 8:00 AM. Si vez a algien sin hogar durante esta temperatura baja, por favor llámanos al 311 https://t.co/kYFdEOazeH |
@GabrielleDeP20 I understand your concern. For privately owned buildings, you may file a Smoking Complaint here: https://t.co/Pb21Is93Og or DM me to file it for you. For NYCHA building, you must reach out to NYCHA Customer Contact Center at 718-707-7771 https://t.co/3AllJpYxhR. |
.@NYCDHSs Code Blue is in effect until tomorrow, Thursday, December 6 at 8:00 AM. If you see a homeless person outside in these frigid temperatures, please call us at 311. https://t.co/WYFH5ahuA6 |
# Tweet(s) whose score is at (or below) the lowest sentiment score.
most.negative <- nyc311_tweets_txt[min(sent.value) >= sent.value]
most.negative
## [1] "@nyc311 Im bringing it to your attention before someone is god forbid robbed, killed or raped inside this section of this abandoned home! This act before something unfortunately bad occurs!"
# Tweets with a score of exactly zero, previewing the first five.
neutral.tweets <- nyc311_tweets_txt[0 == sent.value]
kable(head(neutral.tweets, n = 5))
x |
---|
Ayuda a @NYCSanitation este invierno a mantener NYC limpio. Regístrese para ser un trabajador de la nieve y reciba un salario de $15 por hora. Vea los requerimientos y como aplicar en línea aquí: https://t.co/LzfHQ3FRxc https://t.co/PvprczRXYM |
¿Listo para poner el árbol navideño? @FDNY quiere acordarle que su seguridad es primordial durante esta época Navideña. Cables y enchufes eléctricos son la causa de más de mitad de todos los incendios de árboles navideños. Aprenda como mantenerse seguro: https://t.co/INlY3rJuMk https://t.co/e4p0sppy8A |
@NYC_DG Thank you for bringing this to our attention. You can file a Poster and Sign Complaint here https://t.co/HL3PFECiAi or DM me and I will file it on your behalf. |
@deuchyle Thank you for bringing this to our attention. You can file a Noise from a Business Complaint here https://t.co/vQgxrkoZC7 or you can DM me and I will file it on your behalf. |
Obtenga cobertura médica NYC a través de @NYStateofHealth. El periodo de inscripción está abierto ahora. Inscríbase antes del 15 de diciembre para obtener cobertura que comienza el 1 de enero de 2019. Aprenda más información aquí: https://t.co/QCNbI3BhrP https://t.co/7NfOHz4BNC |
#install.packages("plotly")
library(plotly)
# Label each score by its sign: sign() gives -1, 0 or 1, which is shifted to
# index 1, 2 or 3 of the label vector (a missing score stays missing).
category_sent <- c("Negative", "Neutral", "Positive")[sign(sent.value) + 2]
# Count the tweets in each category and show the shares as a pie chart.
totals <- data.frame(table(category_sent))
layout(
  plot_ly(
    totals,
    labels = ~category_sent,
    values = ~Freq,
    type = 'pie',
    textinfo = 'label+percent'
  ),
  title = 'NYC311 Tweets by Sentiment'
)