# Load the NYPD complaint data (year-to-date export) from the working directory.
# NOTE(review): readr is attached here, but the visible code reads the file
# with base read.csv(); confirm whether readr is needed elsewhere.
library(readr)
# Print large numbers such as CMPLNT_NUM in full rather than in scientific
# notation.
options(scipen = 999)
# read.csv() already returns a data.frame, so no data.frame() wrapper is needed.
# stringsAsFactors = TRUE is spelled out because the summary() calls later in
# this report print per-level counts, which requires factor columns; since
# R 4.0 the default would read text columns as plain character vectors.
df <- read.csv(
  "NYPD_Complaint_Data_Current__Year_To_Date_.csv",
  stringsAsFactors = TRUE
)
# Preview the first six rows.
head(df)
## CMPLNT_NUM ADDR_PCT_CD BORO_NM CMPLNT_FR_DT CMPLNT_FR_TM CMPLNT_TO_DT
## 1 314773184 48 BRONX 12/31/2019 18:00:00
## 2 289837961 25 MANHATTAN 12/30/2019 20:30:00 12/31/2019
## 3 535744284 77 BROOKLYN 12/24/2019 16:55:00 12/24/2019
## 4 895678119 52 BRONX 12/30/2019 19:32:00
## 5 299841674 18 MANHATTAN 12/30/2019 15:30:00 12/30/2019
## 6 136697381 94 BROOKLYN 12/28/2019 13:00:00 12/29/2019
## CMPLNT_TO_TM CRM_ATPT_CPTD_CD HADEVELOPT HOUSING_PSA JURISDICTION_CODE
## 1 COMPLETED NA 0
## 2 10:00:00 COMPLETED NA 0
## 3 17:00:00 COMPLETED NA 0
## 4 COMPLETED NA 0
## 5 16:50:00 COMPLETED NA 0
## 6 8:30:00 COMPLETED NA 0
## JURIS_DESC KY_CD LAW_CAT_CD LOC_OF_OCCUR_DESC OFNS_DESC PARKS_NM
## 1 N.Y. POLICE DEPT 105 FELONY ROBBERY
## 2 N.Y. POLICE DEPT 341 MISDEMEANOR INSIDE PETIT LARCENY
## 3 N.Y. POLICE DEPT 106 FELONY FELONY ASSAULT
## 4 N.Y. POLICE DEPT 341 MISDEMEANOR INSIDE PETIT LARCENY
## 5 N.Y. POLICE DEPT 341 MISDEMEANOR PETIT LARCENY
## 6 N.Y. POLICE DEPT 341 MISDEMEANOR INSIDE PETIT LARCENY
## PATROL_BORO PD_CD PD_DESC
## 1 PATROL BORO BRONX 386 ROBBERY,PERSONAL ELECTRONIC DEVICE
## 2 PATROL BORO MAN NORTH 338 LARCENY,PETIT FROM BUILDING,UN
## 3 PATROL BORO BKLYN NORTH 109 ASSAULT 2,1,UNCLASSIFIED
## 4 PATROL BORO BRONX 333 LARCENY,PETIT FROM STORE-SHOPL
## 5 PATROL BORO MAN SOUTH 301 LARCENY,PETIT BY ACQUIRING LOS
## 6 PATROL BORO BKLYN NORTH 349 LARCENY,PETIT OF LICENSE PLATE
## PREM_TYP_DESC RPT_DT STATION_NAME SUSP_AGE_GROUP SUSP_RACE
## 1 STREET 12/31/2019 UNKNOWN UNKNOWN
## 2 RESIDENCE - APT. HOUSE 12/31/2019
## 3 STREET 12/31/2019 UNKNOWN BLACK
## 4 CHAIN STORE 12/30/2019 25-44 WHITE
## 5 STREET 12/30/2019
## 6 STREET 12/30/2019
## SUSP_SEX TRANSIT_DISTRICT VIC_AGE_GROUP VIC_RACE VIC_SEX X_COORD_CD
## 1 U NA 45-64 WHITE HISPANIC F 1016990
## 2 NA 25-44 WHITE HISPANIC F 999296
## 3 M NA 18-24 WHITE M 1001068
## 4 F NA UNKNOWN UNKNOWN D 1009690
## 5 NA 25-44 WHITE HISPANIC M 985766
## 6 NA UNKNOWN UNKNOWN D 997964
## Y_COORD_CD Latitude Longitude Lat_Lon
## 1 244612 40.83803 -73.88168 (40.838026269000075, -73.88168118799997)
## 2 230862 40.80033 -73.94566 (40.800334261000046, -73.94565697199994)
## 3 183317 40.66983 -73.93938 (40.66983179600004, -73.93937555099996)
## 4 257590 40.87367 -73.90801 (40.87367103500002, -73.90801364899994)
## 5 216503 40.76094 -73.99453 (40.76093528000007, -73.99452906599998)
## 6 205703 40.73128 -73.95052 (40.73128148300003, -73.95051933399996)
# List the column names of the loaded complaint data.
colnames(df)
## [1] "CMPLNT_NUM" "ADDR_PCT_CD" "BORO_NM"
## [4] "CMPLNT_FR_DT" "CMPLNT_FR_TM" "CMPLNT_TO_DT"
## [7] "CMPLNT_TO_TM" "CRM_ATPT_CPTD_CD" "HADEVELOPT"
## [10] "HOUSING_PSA" "JURISDICTION_CODE" "JURIS_DESC"
## [13] "KY_CD" "LAW_CAT_CD" "LOC_OF_OCCUR_DESC"
## [16] "OFNS_DESC" "PARKS_NM" "PATROL_BORO"
## [19] "PD_CD" "PD_DESC" "PREM_TYP_DESC"
## [22] "RPT_DT" "STATION_NAME" "SUSP_AGE_GROUP"
## [25] "SUSP_RACE" "SUSP_SEX" "TRANSIT_DISTRICT"
## [28] "VIC_AGE_GROUP" "VIC_RACE" "VIC_SEX"
## [31] "X_COORD_CD" "Y_COORD_CD" "Latitude"
## [34] "Longitude" "Lat_Lon"
The loaded data set contains all complaints recorded year to date as of 12/31/2019. The suspect age group takes five known values:
<18,
18-24,
25-44,
45-64
and 65+.
All remaining values are unknown age groups, so I will keep only the rows whose suspect age group is one of these five known ranges.
# Keep only the complaints whose suspect age group is one of the five known
# ranges. %in% never yields NA, so rows with a missing age group are dropped.
research_data <- df[
  df$SUSP_AGE_GROUP %in% c("<18", "18-24", "25-44", "45-64", "65+"),
]
# Translate each suspect age group into numeric lower/upper age bounds.
# The open-ended "65+" group is given an upper bound of 100.
age_group_bounds <- data.frame(
  group = c("<18", "18-24", "25-44", "45-64", "65+"),
  lower = c(0, 18, 25, 45, 65),
  upper = c(17, 24, 44, 64, 100),
  stringsAsFactors = FALSE
)
age_group_label <- as.character(research_data$SUSP_AGE_GROUP)
age_bound_row <- match(age_group_label, age_group_bounds$group)
# Rows with a recognised label get their bounds from the table; rows with a
# missing label become NA; any other label keeps the -1 placeholder.
age_resolved <- !is.na(age_bound_row) | is.na(age_group_label)

research_data$lower_age <- -1
research_data$upper_age <- -1
research_data$lower_age[age_resolved] <-
  age_group_bounds$lower[age_bound_row[age_resolved]]
research_data$upper_age[age_resolved] <-
  age_group_bounds$upper[age_bound_row[age_resolved]]

# Number of complaints that have a known suspect age group.
totalCases <- nrow(research_data)
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
The data I have collected is the year-to-date data on crimes committed in New York City in 2019. In my research question I want to see whether there is a relationship between the crimes committed and the age group of the suspect.
What are the cases, and how many are there?
Each case is a reported crime, categorized as one of three categories: felony, misdemeanor, or violation. There are a total of 216182 cases in the data set I will be studying.
Describe the method of data collection.
I collected the data from the New York City public data set of NYPD
complaints filed year to date, and I will analyze this data for the crimes
and their relationship with age.
What type of study is this (observational/experiment)?
This will be an observational study.
If you collected the data, state self-collected. If not, provide a citation/link.
I got the data from the New York City public safety website; see the link below.
https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i/data
What is the response variable? Is it quantitative or qualitative?
The response variable is LAW_CAT_CD. It is a qualitative variable, as it describes the category of crime committed: felony, misdemeanor, or violation.
You should have two independent variables, one quantitative and one qualitative.
The two independent variables will be the gender of the suspect, which is qualitative, and the age of the suspect, which is quantitative.
Provide summary statistics for each the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
# Suspect sex: per-level counts, then a bar chart of the same counts.
summary(research_data[["SUSP_SEX"]])
## F M U
## 0 49725 162495 3962
Sex <- table(research_data[["SUSP_SEX"]])
barplot(Sex)
# Crime category (the response variable): per-level counts and bar chart.
summary(research_data[["LAW_CAT_CD"]])
## FELONY MISDEMEANOR VIOLATION
## 59848 111733 44601
CrimeCategory <- table(research_data[["LAW_CAT_CD"]])
barplot(CrimeCategory)
# lower_age holds the lower bound of each suspect's age group, so the figures
# below describe those lower bounds rather than exact ages.
summary(research_data[["lower_age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 25.00 25.00 27.09 25.00 65.00
barplot(table(research_data[["lower_age"]]))
mean(research_data[["lower_age"]])
## [1] 27.09262
# Share of complaints per suspect age group, relative to totalCases.
# Each percentage_Of_* variable stores a proportion between 0 and 1; it is
# scaled to a percentage only when printed.

# Suspects under 18.
suspects_data_set_Under_18 <- research_data[
  which(research_data$SUSP_AGE_GROUP == "<18"),
]
percentage_Of_Teens <- nrow(suspects_data_set_Under_18) / totalCases
100 * percentage_Of_Teens
## [1] 5.590197

# Suspects aged 18-24, with a bar chart of suspect sex within the group.
suspects_data_set_18_to_24 <- research_data[
  which(research_data$SUSP_AGE_GROUP == "18-24"),
]
percentage_Of_Young_adults <- nrow(suspects_data_set_18_to_24) / totalCases
100 * percentage_Of_Young_adults
## [1] 18.06996
barplot(table(suspects_data_set_18_to_24[["SUSP_SEX"]]))

# Suspects aged 25-44, with a bar chart of suspect sex within the group.
suspects_data_set_25_to_44 <- research_data[
  which(research_data$SUSP_AGE_GROUP == "25-44"),
]
percentage_Of_adults <- nrow(suspects_data_set_25_to_44) / totalCases
100 * percentage_Of_adults
## [1] 54.47123
barplot(table(suspects_data_set_25_to_44[["SUSP_SEX"]]))

# Suspects aged 45-64.
suspects_data_set_45_64 <- research_data[
  which(research_data$SUSP_AGE_GROUP == "45-64"),
]
percentage_Of_older_adults <- nrow(suspects_data_set_45_64) / totalCases
100 * percentage_Of_older_adults
## [1] 19.96188

# Suspects aged 65 and over.
suspects_data_set_65_plus <- research_data[
  which(research_data$SUSP_AGE_GROUP == "65+"),
]
percentage_Of_Senior <- nrow(suspects_data_set_65_plus) / totalCases
100 * percentage_Of_Senior
## [1] 1.906727