setwd( "/Users/kush/Desktop/BANA stuff/BANA 4137 Descriptive Analytics/Final Project /FinalProject")
marketing <- read_delim("marketing_campaign.csv",
"\t", escape_double = FALSE, trim_ws = TRUE)
marketing <- marketing %>% mutate(children = Kidhome + Teenhome, teen = ifelse(Teenhome >0, TRUE, FALSE))
Our objective when analyzing the marketing campaign data is to identify trends or relationships to help us better understand the data set. We want to measure whether certain variables are related with one another or find the hidden answers behind how one variable affects the others using visualizations. We want to see the interactions of variables in the datasett to get a good picture of the data. We also are interested to see how Education, Marital Status is related to income, or how having a teen at home vs a kid at home may indicate higher purchases of certain products. We also want to find out whether certain traits of people make them more likely to complain.
What is the relationship between education level and household income?
How is flow of income based on Education and Marital Status? (Sankey Diagram), what is its relationship?
What are the most popular grocery items?
Is there a correlation between age and number of web visits per month?
What is the relationship between household income and web/store purchases per month?
Is there a relationship between age and marital status?
Which Variable correlate the most with Income?
Do certain types of people complain more than the other? Those with kids, or rich people etc
Below are the first 6 rows for each variable contained in the marketing campaign data set. We downloaded this data set from kaggle.com
head(marketing)
## # A tibble: 6 x 31
## ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 5524 1957 Graduation Single 58138 0 0 04-09-2012
## 2 2174 1954 Graduation Single 46344 1 1 08-03-2014
## 3 4141 1965 Graduation Together 71613 0 0 21-08-2013
## 4 6182 1984 Graduation Together 26646 1 0 10-02-2014
## 5 5324 1981 PhD Married 58293 1 0 19-01-2014
## 6 7446 1967 Master Together 62513 0 1 09-09-2013
## # … with 23 more variables: Recency <dbl>, MntWines <dbl>, MntFruits <dbl>,
## # MntMeatProducts <dbl>, MntFishProducts <dbl>, MntSweetProducts <dbl>,
## # MntGoldProds <dbl>, NumDealsPurchases <dbl>, NumWebPurchases <dbl>,
## # NumCatalogPurchases <dbl>, NumStorePurchases <dbl>,
## # NumWebVisitsMonth <dbl>, AcceptedCmp3 <dbl>, AcceptedCmp4 <dbl>,
## # AcceptedCmp5 <dbl>, AcceptedCmp1 <dbl>, AcceptedCmp2 <dbl>, Complain <dbl>,
## # Z_CostContact <dbl>, Z_Revenue <dbl>, Response <dbl>, children <dbl>,
## # teen <lgl>
dim(marketing)
## [1] 2240 31
colSums(is.na(marketing))
## ID Year_Birth Education Marital_Status
## 0 0 0 0
## Income Kidhome Teenhome Dt_Customer
## 24 0 0 0
## Recency MntWines MntFruits MntMeatProducts
## 0 0 0 0
## MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases
## 0 0 0 0
## NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth
## 0 0 0 0
## AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1
## 0 0 0 0
## AcceptedCmp2 Complain Z_CostContact Z_Revenue
## 0 0 0 0
## Response children teen
## 0 0 0
Correlation plots of Products
df_products = marketing[c('MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds')]
pairs.panels(df_products)
marketing %>%
summarise_all(list(~is.na(.)))%>%
pivot_longer(everything(),
names_to = "variables", values_to="missing") %>%
count(variables, missing) %>%
ggplot(aes(y=variables,x=n,fill=missing))+
geom_col()
options(scipen = 999)
marketing %>% drop_na() %>% filter(Income < 500000) %>% ggplot(aes(Education, Income, color = Marital_Status)) +
geom_beeswarm(size=1) +
geom_boxplot(color = 'red', alpha = .5) +
scale_color_viridis_d()
#Historgram of Distribution of birthyear
ggplot(marketing, aes(Year_Birth)) +
geom_histogram(binwidth = 10, bins = 4, fill = "orange3")
AveIncome <- marketing %>% group_by(Education) %>% drop_na() %>% summarise(AverageIncome = mean(Income))
p2 <- AveIncome %>%
plot_ly(labels = ~Education,
values = ~AverageIncome,
marker = list(colors = c("red","blue","purple","yellow", "orange"))) %>%
add_pie(hole = 0.2) %>%
layout(xaxis = list(zeroline = F,
showline = F,
showticklabels = F,
showgrid = F),
yaxis = list(zeroline = F,
showline = F,
showticklabels=F,
showgrid=F))
p2
#Flow of Education to Marital Status by income
AveIncomeMarital <- marketing %>% group_by(Education, Marital_Status) %>% drop_na() %>% summarise(AverageIncome = mean(Income))
finalsankey <- AveIncomeMarital
library(networkD3)
colnames(finalsankey) <- c("source", "target", "value")
finalsankey$target <- paste(finalsankey$target, " ", sep="")
# From these flows we need to create a node data frame: it lists every entities involved in the flow
nodes <- data.frame(name=c(as.character(finalsankey$source), as.character(finalsankey$target)) %>% unique())
# With networkD3, connection must be provided using id, not using real name like in the links dataframe.. So we need to reformat it.
finalsankey$IDsource=match(finalsankey$source, nodes$name)-1
finalsankey$IDtarget=match(finalsankey$target, nodes$name)-1
# prepare colour scale
ColourScal ='d3.scaleOrdinal() .range(["#FDE725FF","#B4DE2CFF","#6DCD59FF","#35B779FF","#1F9E89FF","#26828EFF","#31688EFF","#3E4A89FF","#482878FF","#440154FF"])'
# Make the Network
sankeyNetwork(Links = finalsankey, Nodes = nodes,
Source = "IDsource", Target = "IDtarget",
Value = "value", NodeID = "name",
sinksRight=FALSE, colourScale=ColourScal, nodeWidth=40, fontSize=13, nodePadding=20)
corr_var(marketing, Income, top = 12)
ggplot(marketing) + geom_boxplot(aes(factor(Complain), Year_Birth)) +
ggtitle("Complains by Age") +
theme_bw()
marketing %>% filter(Income < 500000) %>% ggplot() + geom_boxplot(aes(factor(Complain), Income)) +
ggtitle("Complaints by Income")
ggplot(marketing) + geom_boxplot(aes(factor(Complain), children)) +
ggtitle("Complaints by Number of Children")
Those with Children tend to complain more than others.
setwd("/Users/kush/Desktop/BANA stuff/BANA 4137 Descriptive Analytics/Final Project /FinalProject")
police <- read.csv("PDI__Police_Data_Initiative__Police_Calls_for_Service__CAD_.csv")
Our main objective of this analysis is to use map data to show Map of Downtown, with disposition text = offense report and incident related to auto theft. We want to use longitude and latitude and figure our where auto theft occus in Downtown area. Another one of our objective is to look at downtown area with the street address containing “Vine” and check the proportionality based on priority color. We want to show that with a map, we can gett a lot of useful information and pinpoint locatations and create great visualizations. We want to also see where gun usage is most prevalent and to educate people on areas to avoid. We want to analyse this dataset to increase the safety of all the Barcats.
** What are the longitude and latitude of Downtown locations where the disposition text have “offense report”, while he incident was related to auto theft? **
** What is the proporion of Winton Hills community, broken down by top 6 priority counts? **
** What are the count of agency types on the dataset? Are there differing agencies? How many incidents involved a gun and in which streets? **
“ADDRESS_X” - This attribute is the street name that the incident occurred on.
“LATITUDE_X” - The latitude coordinates of the incident.
“LONGITUDE_X” - The longitude coordinates of the incident.
“AGENCY” - The public safety department that repsonded to the incident
“CREATE_TIME_INCIDENT” - This attribute is the date/timestamp (yyyy mmmm dd hh:mm:ss) when the response data was submitted into the CAD system.
“DISPOSITION_TEXT” - The disposition of each incident is the outcome of the incident response.
“EVENT_NUMBER” - CAD incident number related to the incident
“INCIDENT_TYPE_ID” - The problem code of the incident that is used to describe the issue.
“INCIDENT_TYPE_DESC” - The text description of the corresponding problem code (Incident_Type_Id).
“PRIORITY” - The integer code of the repsonse priority. This determines the urgency of the incident.
“PRIORITY_COLOR” -
“ARRIVAL_TIME_PRIMARY_UNIT” - The time of arrival of the first CPD unit on scene.
“CLOSED_TIME_INCIDENT” - This attribute is the date/timestamp (yyyy mmmm dd hh:mm:ss) when the incident is marked complete in the CAD system
“DISPATCH_TIME_PRIMARY_UNIT” - The time of dispatch of the first CPD unit on scene.
“BEAT” - Smaller areas that make up the district
“COMMUNITY_COUNCIL_NEIGHBORHOOD” - The listed neighborhood of the incident using community council defined boundaries. This attribute may differ from the SNA neighborhood depending on the location of the incident. “DISTRICT” - District of the incident response
“SNA_NEIGHBORHOOD” - The listed neighborhood of the incident using the Statistical Neighborhood Approximations (SNA). This neighborhood classification is the underlying reporting area for all City data.
“CPD_NEIGHBORHOOD” - The listed neighborhood of the incident using CPD defined boundaries. This attribute may differ from the SNA neighborhood & Community Council neighborhood depending on the location of the incident.
Below are the first 6 rows for each variable contained in the marketing campaign data set. We downloaded this data set from kaggle.com
head(police)
## ADDRESS_X LATITUDE_X LONGITUDE_X AGENCY CREATE_TIME_INCIDENT
## 1 84XX DESOTO NA NA CP 10/01/2014 10:26:34 PM
## 2 W LIBERTY ST/WESTERN AV NA NA CP 10/01/2014 10:26:55 PM
## 3 OHIO AV/WARNER ST NA NA CP 10/01/2014 10:28:03 PM
## 4 S I75 EX/W MITCHELL AV NA NA CP 10/01/2014 10:30:23 PM
## 5 GEST ST/STATE AV NA NA CP 10/01/2014 10:29:09 PM
## 6 CAMPUS LN/SUTTON AV NA NA CP 10/01/2014 10:32:14 PM
## DISPOSITION_TEXT EVENT_NUMBER INCIDENT_TYPE_ID INCIDENT_TYPE_DESC
## 1 ADV - ADVISED LCP141001001711 ADV
## 2 AST - ASSIST LCP141001001712 HAZARD
## 3 AST - ASSIST LCP141001001715 HAZARD
## 4 NTR - NOTHING TO REPORT LCP141001001717 SDET
## 5 ARR - ARREST LCP141001001718 WANT
## 6 HBF - HANDLED BY FIRE LCP141001001721 PERDWP
## PRIORITY PRIORITY_COLOR ARRIVAL_TIME_PRIMARY_UNIT CLOSED_TIME_INCIDENT
## 1 NA 10/01/2014 10:26:34 PM
## 2 NA 10/01/2014 10:26:55 PM 10/01/2014 10:28:48 PM
## 3 NA 10/01/2014 10:28:03 PM 10/01/2014 10:42:31 PM
## 4 NA 10/02/2014 12:03:13 AM 10/02/2014 06:38:09 AM
## 5 NA 10/01/2014 10:54:06 PM 10/01/2014 11:15:30 PM
## 6 NA 10/01/2014 10:40:19 PM 10/01/2014 10:49:01 PM
## DISPATCH_TIME_PRIMARY_UNIT BEAT COMMUNITY_COUNCIL_NEIGHBORHOOD DISTRICT
## 1 N/A NA
## 2 10/01/2014 10:26:55 PM P134 N/A NA
## 3 10/01/2014 10:28:03 PM P511 N/A NA
## 4 10/01/2014 10:30:30 PM P539 N/A NA
## 5 10/01/2014 10:32:45 PM P315 N/A NA
## 6 10/01/2014 10:38:12 PM P248 N/A NA
## SNA_NEIGHBORHOOD CPD_NEIGHBORHOOD
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
dim(police)
## [1] 3529361 19
colSums(is.na(police))
## ADDRESS_X LATITUDE_X
## 0 437081
## LONGITUDE_X AGENCY
## 437081 0
## CREATE_TIME_INCIDENT DISPOSITION_TEXT
## 0 0
## EVENT_NUMBER INCIDENT_TYPE_ID
## 0 0
## INCIDENT_TYPE_DESC PRIORITY
## 0 854684
## PRIORITY_COLOR ARRIVAL_TIME_PRIMARY_UNIT
## 0 0
## CLOSED_TIME_INCIDENT DISPATCH_TIME_PRIMARY_UNIT
## 0 0
## BEAT COMMUNITY_COUNCIL_NEIGHBORHOOD
## 0 0
## DISTRICT SNA_NEIGHBORHOOD
## 524422 3529361
## CPD_NEIGHBORHOOD
## 3529361
police %>%
summarise_all(list(~is.na(.)))%>%
pivot_longer(everything(),
names_to = "variables", values_to="missing") %>%
count(variables, missing) %>%
ggplot(aes(y=variables,x=n,fill=missing))+
geom_col()
agency <- police %>% group_by(AGENCY) %>%
summarize(count = n())
a <- agency %>% ggplot(aes(AGENCY, count, fill = AGENCY)) +
geom_col() +
theme_minimal() +
ggtitle("Cases handled per agency")
ggplotly(a)
##winton Hills community grouped by priority and top 6 counts of the priority numbers.
vine <- police %>% filter(COMMUNITY_COUNCIL_NEIGHBORHOOD == 'WINTON HILLS') %>% group_by(PRIORITY) %>% summarise(count = n()) %>% top_n(6, count)
p2 <- vine %>%
plot_ly(labels = ~PRIORITY,
values = ~count,
marker = list(colors = c("red","blue","purple","yellow", "orange", "green"))) %>%
add_pie(hole = 0.2) %>%
layout(xaxis = list(zeroline = F,
showline = F,
showticklabels = F,
showgrid = F),
yaxis = list(zeroline = F,
showline = F,
showticklabels=F,
showgrid=F))
p2
police$CREATE_TIME_INCIDENT <- mdy_hms(police$CREATE_TIME_INCIDENT)
police$create_time_incident_year_month <- year_month(police$CREATE_TIME_INCIDENT)
gun_streets <- police %>% select(LONGITUDE_X, LATITUDE_X, INCIDENT_TYPE_ID, ADDRESS_X, create_time_incident_year_month) %>%
filter(INCIDENT_TYPE_ID == "GUN" & create_time_incident_year_month >= "2021-09") %>% drop_na()
gun_streets %>% leaflet() %>%
addTiles() %>% # Add default OpenStreetMap map tiles
addCircleMarkers(lng= gun_streets$LONGITUDE_X, lat = gun_streets$LATITUDE_X, popup = paste0("Streets where a Gun was reported since September 2021 : ", gun_streets$INCIDENT_TYPE_ID, ", " , gun_streets$ADDRESS_X, sep = " ,"))
police %>%
rowwise()%>%
filter(INCIDENT_TYPE_ID == "GUN")%>%
count(ADDRESS_X,sort = TRUE)%>%
filter(n>40)%>%
mutate(ADDRESS_X = reorder(ADDRESS_X, n))%>%
ggplot(aes(ADDRESS_X,n)) + geom_col(fill = "Green") + xlab(NULL) + theme_minimal() + coord_flip() + ylab("Num. of Incendents Involving a Weapon") +xlab("Street Name")+ ggtitle("# of Events Involving Weapons by Street Corner")
Clifton Avenue involved Gun usage the most, followed by Mcmicken Av.
gun_streets2 <- police %>% select(LONGITUDE_X, LATITUDE_X, INCIDENT_TYPE_ID, ADDRESS_X, create_time_incident_year_month, PRIORITY) %>%
filter(INCIDENT_TYPE_ID %in% c("KIDS", "CHILD") & create_time_incident_year_month >= "2021-09") %>% drop_na()
colors <- c("orange", "green")
pal <- colorFactor(colors, gun_streets2$PRIORITY)
gun_streets2 %>% leaflet() %>%
addProviderTiles(providers$Stamen.TonerHybrid) %>% # Add default OpenStreetMap map tiles
addCircleMarkers(lng= gun_streets$LONGITUDE_X, lat = gun_streets$LATITUDE_X, radius = ~6, color = ~pal(PRIORITY),opacity = .2, popup = paste0("Incident type involving kids or child : ", "<b>", gun_streets2$INCIDENT_TYPE_ID, "<br/>" , gun_streets2$ADDRESS_X, "<br/>",gun_streets2$PRIORITY)) %>%
addLegend(pal = pal, values = gun_streets2$PRIORITY, title = "Priority Level")
top10address <- police %>% select(ADDRESS_X, LONGITUDE_X, LATITUDE_X) %>% group_by(ADDRESS_X) %>% summarise(count = n()) %>% top_n(10, count) %>%
arrange(desc(count))
colorz <- c("yellow2", "green")
pallete <- colorFactor(colorz, top10address$count)
top10address$latitude <- c(39.11913, 39.10890, 39.12815, 39.15784, 39.14182, 39.19797,39.14441,39.12838,39.10526,39.13708)
top10address$longitude <- c(-84.54840 ,-84.52174,-84.60133,-84.47360,-84.42456 ,-84.54665, -84.49093, -84.51557 ,-84.54561, -84.50335)
top10address %>% leaflet() %>%
addProviderTiles(providers$Stamen.TonerHybrid) %>% # Add default OpenStreetMap map tiles
addCircleMarkers(lng= top10address$longitude, lat = top10address$latitude, radius = ~6, color = ~pallete(count),opacity = .6, popup = paste0("Top 10 address where a incident was reported : ", "<b>", top10address$count, "<br/>" , top10address$ADDRESS_X)) %>%
addLegend(pal = pallete, values = top10address$count, title = "Incident count by street addreess", position = "topright") %>%
addPolylines(lng = top10address$longitude, lat = top10address$latitude, opacity = .4)