Data Preparation

# load data
library(readr)
# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 999)
# read.csv() already returns a data.frame, so no extra data.frame() wrapper is
# needed. stringsAsFactors is set explicitly because its default changed to
# FALSE in R 4.0.0, and the factor-style summary() counts shown later in this
# report depend on the text columns being factors.
df <- read.csv(
  "NYPD_Complaint_Data_Current__Year_To_Date_.csv",
  stringsAsFactors = TRUE
)
# Preview the first six rows.
head(df)
##   CMPLNT_NUM ADDR_PCT_CD   BORO_NM CMPLNT_FR_DT CMPLNT_FR_TM CMPLNT_TO_DT
## 1  314773184          48     BRONX   12/31/2019     18:00:00             
## 2  289837961          25 MANHATTAN   12/30/2019     20:30:00   12/31/2019
## 3  535744284          77  BROOKLYN   12/24/2019     16:55:00   12/24/2019
## 4  895678119          52     BRONX   12/30/2019     19:32:00             
## 5  299841674          18 MANHATTAN   12/30/2019     15:30:00   12/30/2019
## 6  136697381          94  BROOKLYN   12/28/2019     13:00:00   12/29/2019
##   CMPLNT_TO_TM CRM_ATPT_CPTD_CD HADEVELOPT HOUSING_PSA JURISDICTION_CODE
## 1                     COMPLETED                     NA                 0
## 2     10:00:00        COMPLETED                     NA                 0
## 3     17:00:00        COMPLETED                     NA                 0
## 4                     COMPLETED                     NA                 0
## 5     16:50:00        COMPLETED                     NA                 0
## 6      8:30:00        COMPLETED                     NA                 0
##         JURIS_DESC KY_CD  LAW_CAT_CD LOC_OF_OCCUR_DESC      OFNS_DESC PARKS_NM
## 1 N.Y. POLICE DEPT   105      FELONY                          ROBBERY         
## 2 N.Y. POLICE DEPT   341 MISDEMEANOR            INSIDE  PETIT LARCENY         
## 3 N.Y. POLICE DEPT   106      FELONY                   FELONY ASSAULT         
## 4 N.Y. POLICE DEPT   341 MISDEMEANOR            INSIDE  PETIT LARCENY         
## 5 N.Y. POLICE DEPT   341 MISDEMEANOR                    PETIT LARCENY         
## 6 N.Y. POLICE DEPT   341 MISDEMEANOR            INSIDE  PETIT LARCENY         
##               PATROL_BORO PD_CD                            PD_DESC
## 1       PATROL BORO BRONX   386 ROBBERY,PERSONAL ELECTRONIC DEVICE
## 2   PATROL BORO MAN NORTH   338     LARCENY,PETIT FROM BUILDING,UN
## 3 PATROL BORO BKLYN NORTH   109           ASSAULT 2,1,UNCLASSIFIED
## 4       PATROL BORO BRONX   333     LARCENY,PETIT FROM STORE-SHOPL
## 5   PATROL BORO MAN SOUTH   301     LARCENY,PETIT BY ACQUIRING LOS
## 6 PATROL BORO BKLYN NORTH   349     LARCENY,PETIT OF LICENSE PLATE
##            PREM_TYP_DESC     RPT_DT STATION_NAME SUSP_AGE_GROUP SUSP_RACE
## 1                 STREET 12/31/2019                     UNKNOWN   UNKNOWN
## 2 RESIDENCE - APT. HOUSE 12/31/2019                                      
## 3                 STREET 12/31/2019                     UNKNOWN     BLACK
## 4            CHAIN STORE 12/30/2019                       25-44     WHITE
## 5                 STREET 12/30/2019                                      
## 6                 STREET 12/30/2019                                      
##   SUSP_SEX TRANSIT_DISTRICT VIC_AGE_GROUP       VIC_RACE VIC_SEX X_COORD_CD
## 1        U               NA         45-64 WHITE HISPANIC       F    1016990
## 2                        NA         25-44 WHITE HISPANIC       F     999296
## 3        M               NA         18-24          WHITE       M    1001068
## 4        F               NA       UNKNOWN        UNKNOWN       D    1009690
## 5                        NA         25-44 WHITE HISPANIC       M     985766
## 6                        NA       UNKNOWN        UNKNOWN       D     997964
##   Y_COORD_CD Latitude Longitude                                  Lat_Lon
## 1     244612 40.83803 -73.88168 (40.838026269000075, -73.88168118799997)
## 2     230862 40.80033 -73.94566 (40.800334261000046, -73.94565697199994)
## 3     183317 40.66983 -73.93938  (40.66983179600004, -73.93937555099996)
## 4     257590 40.87367 -73.90801  (40.87367103500002, -73.90801364899994)
## 5     216503 40.76094 -73.99453  (40.76093528000007, -73.99452906599998)
## 6     205703 40.73128 -73.95052  (40.73128148300003, -73.95051933399996)
# List every column name of the loaded data frame.
colnames(df)
##  [1] "CMPLNT_NUM"        "ADDR_PCT_CD"       "BORO_NM"          
##  [4] "CMPLNT_FR_DT"      "CMPLNT_FR_TM"      "CMPLNT_TO_DT"     
##  [7] "CMPLNT_TO_TM"      "CRM_ATPT_CPTD_CD"  "HADEVELOPT"       
## [10] "HOUSING_PSA"       "JURISDICTION_CODE" "JURIS_DESC"       
## [13] "KY_CD"             "LAW_CAT_CD"        "LOC_OF_OCCUR_DESC"
## [16] "OFNS_DESC"         "PARKS_NM"          "PATROL_BORO"      
## [19] "PD_CD"             "PD_DESC"           "PREM_TYP_DESC"    
## [22] "RPT_DT"            "STATION_NAME"      "SUSP_AGE_GROUP"   
## [25] "SUSP_RACE"         "SUSP_SEX"          "TRANSIT_DISTRICT" 
## [28] "VIC_AGE_GROUP"     "VIC_RACE"          "VIC_SEX"          
## [31] "X_COORD_CD"        "Y_COORD_CD"        "Latitude"         
## [34] "Longitude"         "Lat_Lon"
The loaded data set contains all cases reported year to date as of 12/31/2019. The suspect age group has essentially five known ranges:
        <18 , 
        18-24,
        25-44,
        45-64 
        and 65+ 
All remaining records have an unknown age group, so I will keep only the records whose suspect age group is one of these five known ranges.
# Keep only the rows whose suspect age group is one of the five known ranges;
# rows with any other SUSP_AGE_GROUP value are dropped.
research_data <- subset(
  df,
  SUSP_AGE_GROUP %in% c("<18", "18-24", "25-44", "45-64", "65+")
)
# Numeric bounds for each known suspect age range. "<18" is treated as 0-17,
# and the open-ended "65+" range is given 100 as a nominal upper bound.
age_group_bounds <- data.frame(
  group = c("<18", "18-24", "25-44", "45-64", "65+"),
  lower = c(0, 18, 25, 45, 65),
  upper = c(17, 24, 44, 64, 100),
  stringsAsFactors = FALSE
)

# Look each row's age group up in the table once instead of running ten
# sentinel-guarded ifelse() passes. as.character() makes the lookup work
# whether SUSP_AGE_GROUP is a factor or a character column. A label that is
# not in the table yields NA rather than a misleading -1 age.
age_group_row <- match(
  as.character(research_data$SUSP_AGE_GROUP),
  age_group_bounds$group
)
research_data$lower_age <- age_group_bounds$lower[age_group_row]
research_data$upper_age <- age_group_bounds$upper[age_group_row]

# Number of cases that have a known suspect age group.
totalCases <- nrow(research_data)

Research question

You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.

The data that I have collected is the year-to-date data on crimes committed in New York City in 2019. With my research question, I want to see whether there is a relationship between the crimes committed and the age group of the suspect.

Cases

What are the cases, and how many are there?

Each case is a crime committed, categorized in one of three categories: felony, misdemeanor, or violation. There are a total of 216182 cases in the data set I will be studying.

Data collection

Describe the method of data collection.

I have collected data from the New York City public data set of NYPD
complaints that were filed year to date, and I will analyze this data for the
crimes and their relationships with age.

Type of study

What type of study is this (observational/experiment)?

  This will be an observational study.

Data Source

If you collected the data, state self-collected. If not, provide a citation/link.

 I got the data from the New York City public safety website; see the link below.
 
 https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i/data

Dependent Variable

What is the response variable? Is it quantitative or qualitative?

The response variable is LAW_CAT_CD. It is a qualitative variable, as it describes the category of crime committed: felony, misdemeanor, or violation.

Independent Variable

You should have two independent variables, one quantitative and one qualitative.

The two independent variables will be the gender of the suspect, which is qualitative, and the age of the suspect, which is quantitative.

Relevant summary statistics

Provide summary statistics for each the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.

# Suspect sex: summary of the column, then a bar chart of its frequency table.
summary(research_data[["SUSP_SEX"]])
##             F      M      U 
##      0  49725 162495   3962
Sex <- table(research_data[["SUSP_SEX"]])

barplot(Sex)

# Crime category (the response variable): summary and bar chart.
summary(research_data[["LAW_CAT_CD"]])
##      FELONY MISDEMEANOR   VIOLATION 
##       59848      111733       44601
CrimeCategory <- table(research_data[["LAW_CAT_CD"]])
barplot(CrimeCategory)

# Lower bound of the suspect's age range: five-number summary, bar chart of
# how often each bound occurs, and the mean.
summary(research_data[["lower_age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   25.00   25.00   27.09   25.00   65.00
barplot(table(research_data[["lower_age"]]))

mean(research_data[["lower_age"]])
## [1] 27.09262
# Share of cases per suspect age group. Each percentage_Of_* variable holds a
# proportion (rows in the group divided by totalCases); it is multiplied by
# 100 only when printed.

# Under 18.
suspects_data_set_Under_18 <- subset(research_data, SUSP_AGE_GROUP == "<18")

percentage_Of_Teens <- nrow(suspects_data_set_Under_18) / totalCases
100 * percentage_Of_Teens
## [1] 5.590197
# 18 to 24, with a bar chart of suspect sex within the group.
suspects_data_set_18_to_24 <- subset(research_data, SUSP_AGE_GROUP == "18-24")

percentage_Of_Young_adults <- nrow(suspects_data_set_18_to_24) / totalCases
100 * percentage_Of_Young_adults
## [1] 18.06996
barplot(table(suspects_data_set_18_to_24[["SUSP_SEX"]]))

# 25 to 44, with a bar chart of suspect sex within the group.
suspects_data_set_25_to_44 <- subset(research_data, SUSP_AGE_GROUP == "25-44")
percentage_Of_adults <- nrow(suspects_data_set_25_to_44) / totalCases
100 * percentage_Of_adults
## [1] 54.47123
barplot(table(suspects_data_set_25_to_44[["SUSP_SEX"]]))

# 45 to 64.
suspects_data_set_45_64 <- subset(research_data, SUSP_AGE_GROUP == "45-64")
percentage_Of_older_adults <- nrow(suspects_data_set_45_64) / totalCases
100 * percentage_Of_older_adults
## [1] 19.96188
# 65 and over.
suspects_data_set_65_plus <- subset(research_data, SUSP_AGE_GROUP == "65+")
percentage_Of_Senior <- nrow(suspects_data_set_65_plus) / totalCases
100 * percentage_Of_Senior
## [1] 1.906727