## 
## Welcome to CUNY DATA606 Statistics and Probability for Data Analytics 
## This package is designed to support this course. The text book used 
## is OpenIntro Statistics, 3rd Edition. You can read this by typing 
## vignette('os3') or visit www.OpenIntro.org. 
##  
## The getLabs() function will return a list of the labs available. 
##  
## The demo(package='DATA606') will list the demos that are available.

Part 1 - Introduction

For this project, I decided to work with the NYC criminal complaints. After seeing how large the full dataset on the NYC Open Data website is, I decided to work with a sample, focusing on Manhattan and the year 2017. In this analysis, I want to see whether there is a relationship between the weather and crime.

The research questions are: Is there a relationship between crime rates and the weather? Is the monthly average crime count the same, or does it vary?

This is an observational study.

Part 2 - Data

Data details and Sources

NYC Open Data offers access to the NYPD crime complaint dataset, a condensed dataset which, as of its last update in November 2018, has about 6.4 million records and 35 columns. For this project I will be working with a sample subset covering the year 2017 and focusing on the events that occurred in Manhattan. The sample subset has 113,782 records (observations).

The data was retrieved from NYC Open Data: https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i

The weather data was retrieved from the National Oceanic and Atmospheric Administration (NOAA) National Weather Service Forecast Office, New York, NY: https://w2.weather.gov/climate/xmacis.php?wfo=okx

Data Preparation

For this project, we noticed that the NYC crime complaints dataset has 35 variables, but we will select only a few of them to work with.

# Load the Manhattan complaint sample from the working directory.
data <- read_csv("manhattan.csv")
## Warning: Missing column names filled in: 'X1' [1]

# Keep only the columns used in this analysis and turn the complaint start
# date from text into a Date.
# NOTE(review): "%y" is the two-digit-year format code, while the weather
# dates further down are parsed with "%Y" -- confirm which one matches the
# date strings stored in manhattan.csv.
data.new <- select(
  data,
  CMPLNT_FR_DT, CMPLNT_FR_TM, OFNS_DESC, PD_DESC, LAW_CAT_CD,
  BORO_NM, PREM_TYP_DESC, Latitude, Longitude
)
data.new$CMPLNT_FR_DT <- as.Date(
  as.character(data.new$CMPLNT_FR_DT),
  format = "%m/%d/%y"
)

# Number of complaints per calendar date, in columns `date` and `total`.
count.date.crime <- data.new %>%
  group_by(date = CMPLNT_FR_DT) %>%
  summarise(total = n())

# Load the weather table and drop its 365th row.
# NOTE(review): the row is removed by position, presumably so that the weather
# table has as many rows as count.date.crime; the two tables are later paired
# row by row, so confirm that the dropped day is the one missing from the
# complaint data.
weather.data <- read_csv("weather.data.csv")
weather <- weather.data[-365, ]

Part 3 - Exploratory data analysis

Crime Complaints Graph

# Histogram of the daily complaint totals; the bin count is left at the
# ggplot2 default.
ggplot(data = count.date.crime, mapping = aes(x = total)) +
  geom_histogram(colour = "blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Total number of complaints for each offense level (LAW_CAT_CD).
data.crime.type <- summarise(group_by(data.new, LAW_CAT_CD), total = n())
head(data.crime.type)
## # A tibble: 3 x 2
##   LAW_CAT_CD  total
##   <chr>       <int>
## 1 FELONY      33998
## 2 MISDEMEANOR 64524
## 3 VIOLATION   13002
# Bar chart of the number of complaints per offense level.
# Axis settings are supplied through layout(); plot_ly() only receives trace
# attributes (a trace's own `xaxis` attribute is an axis reference, not a
# list of tick options). Tick placement is left at the automatic default.
pl1 <- data.crime.type %>%
  plot_ly(
    x = ~LAW_CAT_CD,
    y = ~total,
    type = "bar",
    marker = list(color = ~total, size = 20, opacity = 0.9)
  ) %>%
  layout(
    xaxis = list(title = "Type of Crime"),
    yaxis = list(title = "Number of Criminal Complaints")
  )

pl1

Weather Graph

# Distribution of the average temperature values.
hist(weather$avg)

# Bar chart of the average temperature with the dates bucketed by month.
# x carries the month and y the `avg` column, and the axis titles say so.
# Axis settings are supplied through layout(); plot_ly() only receives trace
# attributes.
# NOTE(review): the rows are only grouped by month, not aggregated, so every
# row of `weather` still contributes its own bar -- confirm whether a monthly
# mean was intended.
pl1 <- weather %>%
  group_by(
    month = floor_date(as.Date(as.character(date), "%m/%d/%Y"), "month")
  ) %>%
  plot_ly(
    x = ~month,
    y = ~avg,
    type = "bar",
    marker = list(color = ~avg, size = 20, opacity = 0.9)
  ) %>%
  layout(
    xaxis = list(title = "Month"),
    yaxis = list(title = "Average Temperature")
  )

pl1

Part 4 - Inference

we reject the null

Linear regression

H0 - There is no relationship between crime complaints and the average weather (temperature).

H1 - There is a relationship between crime complaints and the average weather (temperature).

ANOVA (we reject the null)

H0 - The mean number of complaints is the same for all types of crime; it does not vary.

Ha - The mean number of complaints is not the same for all types of crime; at least one mean differs.

Linear Regression

For the linear regression, I took the total daily count of criminal complaints and the daily average temperature. The steps are:
1. Linear regression
2. Summary
3. Q-Q plot
4. Plot

# Pearson correlation between the daily complaint totals and the average
# temperature; row i of one table is paired with row i of the other.
cor(x = count.date.crime$total, y = weather$avg)
## [1] 0.4893039
# Scatter plot with the explanatory variable (average temperature) on the
# x-axis and the response (daily complaints) on the y-axis, matching the
# model fitted below.
plot(weather$avg, count.date.crime$total)

# Simple linear regression of daily complaints on average temperature.
# The two vectors come from separate tables and are matched by row position.
m1 <- lm(count.date.crime$total ~ weather$avg)
summary(m1)
## 
## Call:
## lm(formula = count.date.crime$total ~ weather$avg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -137.110  -23.982    4.588   25.664  124.733 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 233.6378     7.1071   32.87   <2e-16 ***
## weather$avg   1.2870     0.1206   10.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38.48 on 362 degrees of freedom
## Multiple R-squared:  0.2394, Adjusted R-squared:  0.2373 
## F-statistic:   114 on 1 and 362 DF,  p-value: < 2.2e-16
# Diagnostics for the simple linear fit m1.
# plot() on an lm object draws R's standard regression diagnostic plots.
plot(m1)

# Normal Q-Q plot of the residuals with a reference line, to judge how close
# the residuals are to a normal distribution.
qqnorm(m1$residuals)
qqline(m1$residuals)

# Residuals plotted against their position (row index) in the data.
plot(m1$residuals)

# plot_ss() is not defined in this file -- presumably it comes from the course
# package; confirm. Per the printed output it fits a line through the points
# and reports the sum of squares.
plot_ss(x= weather$avg , y=count.date.crime$total)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##     233.638        1.287  
## 
## Sum of Squares:  536102.2

Quadratic regression

After looking at the summary of the linear regression, I noticed how small the R-squared value was, so I decided to fit a quadratic regression to see if I could get a better R-squared value.

# Squared average temperature, used as the quadratic term of the model.
weather.double <- weather$avg^2

# Daily complaints against the average temperature, with the explanatory
# variable on the x-axis as in the fitted models.
plot(weather$avg, count.date.crime$total)

# Daily complaints against the sum of the average temperature and its square.
# NOTE(review): this sum is not the predictor used by the model below, which
# estimates a separate coefficient for each term -- confirm the plot is wanted.
plot(weather$avg + weather.double, count.date.crime$total)

# Quadratic regression: daily complaints on the average temperature and its
# square. The vectors are matched by row position.
m2 <- lm(count.date.crime$total ~ weather$avg + weather.double)
summary(m2)
## 
## Call:
## lm(formula = count.date.crime$total ~ weather$avg + weather.double)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -132.200  -23.294    2.863   24.479  119.129 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    152.77405   19.88631   7.682 1.49e-13 ***
## weather$avg      4.59812    0.77213   5.955 6.17e-09 ***
## weather.double  -0.03059    0.00705  -4.339 1.86e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.57 on 361 degrees of freedom
## Multiple R-squared:  0.2771, Adjusted R-squared:  0.2731 
## F-statistic: 69.19 on 2 and 361 DF,  p-value: < 2.2e-16
# Diagnostics for the quadratic fit m2.
# plot() on an lm object draws R's standard regression diagnostic plots.
plot(m2)

# Histogram of the residuals, to judge the shape of their distribution.
hist(m2$residuals)

# Residuals plotted against their position (row index) in the data, with a
# solid horizontal reference line at zero.
plot(m2$residuals)
abline(h = 0, lty = 1)

ANOVA

After fitting a linear regression between the weather and the crime complaint data, we noticed there was a relationship between them. I then decided to run a second analysis on the data, an ANOVA, to see whether the mean varies depending on the level of the crime complaint (Felony, Misdemeanor, Violation).

# Number of complaints per date and offense level.
# ungroup() removes the grouping that summarise() leaves behind, so the
# column selection below operates on a plain tibble.
count.type <- data.new %>%
  group_by(CMPLNT_FR_DT, LAW_CAT_CD) %>%
  summarise(total = n()) %>%
  ungroup()

# Keep the offense level (renamed to `type`) and the daily count, selecting
# the columns by name rather than by position.
dat <- select(count.type, type = LAW_CAT_CD, total)
df.anova <- as.data.frame(dat)

# Box plot of the daily counts for each offense level.
ggplot(df.anova, aes(x = type, y = total, fill = type)) +
  geom_boxplot()

# One-way ANOVA: does the mean daily count differ between offense levels?
anova.analysis <- aov(total ~ type, data = df.anova)

summary(anova.analysis)
##               Df  Sum Sq Mean Sq F value Pr(>F)    
## type           2 3687899 1843949    4902 <2e-16 ***
## Residuals   1089  409668     376                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Part 5 - Conclusion

At the end of this analysis, we can draw some conclusions about the questions and assumptions we had at the beginning. The first analysis was a linear regression, in which we can see there is a relationship between the weather and crime rates, with a significant p-value.

In the plots, the residuals look approximately normally distributed with constant variability, and they show a random pattern, which also supports the linear regression. The only value that was small was the R-squared, which led to fitting a quadratic regression to see if it would give a better R-squared. The quadratic model shows almost the same result, with only a small improvement in the R-squared value (0.277 versus 0.239).

Although we can see that there is a relationship between the variables, we still see some outliers that may affect linearity in the graph, so there may be better methods/models to work with this dataset.

The last test was an ANOVA, comparing the mean of the daily complaint counts by type of offense. The result shows variation between the group means.

We can reject both null hypotheses after looking at the test results from the linear and quadratic regressions and the ANOVA.

I think it would be better to find a better modeling approach for future analysis and prediction with this dataset.