Initial Steps

Packages Used

library(readxl) #for loading data
library(fitdistrplus) #for fitting mathematical curves

## Loading required package: MASS

## Loading required package: survival

library(LogisticCurveFitting) #for fitting logistic curve
library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()

Loading the datasets

# Read the workbook into a data frame.
df <- read_excel("YEAR 2002-21-2.xlsx")
# Row 37 is set aside as the totals row; it is turned into a time series later.
df.total <- df[37, ]
# Rows 37-40 are removed so `df` keeps only the remaining rows
# (presumably rows 37-40 are total/footer rows -- confirm against the workbook).
df <- df[-seq(37, 40), ]

Setting up the time series data

# Turn the totals row into an annual time series starting in 2002: drop the
# first (label) column, coerce the rest to numeric and rescale to thousands.
df.total <- ts(as.numeric(unlist(df.total[-1])), start = 2002) / 1000
# Line plot of the series (title typo "indt India" corrected to "in India").
plot(
  df.total,
  main = "Line Plot of Cyber-Crimes in India (2002-2021)",
  ylab = "No. of Crimes('000)",
  xlab = "Year"
)

Fitting Mathematical Curves

# Time index for the 20 annual observations (t = 0 is the first year).
t <- 0:19
# Keep the index and the series side by side so lm() can take them via `data`.
d.total <- data.frame(t, df.total)

# Linear growth model: y(t) = a + b*t
poly1 <- lm(df.total ~ t, data = d.total)
lin.fit <- coef(poly1)
linear.growth.model <- function(t) {
  lin.fit[1] + lin.fit[2] * t
}

# Quadratic growth model: y(t) = a + b*t + c*t^2
poly2 <- lm(df.total ~ t + I(t^2), data = d.total)
quad.fit <- coef(poly2)
quad.growth.model <- function(t) {
  quad.fit[1] + quad.fit[2] * t + quad.fit[3] * t^2
}

# Exponential growth model: y(t) = a*exp(r*t), fitted as a straight line on
# the log scale and back-transformed with exp().
# NOTE(review): est.lambda is not used by any model in this section.
est.lambda <- 1 / mean(df.total) # mle of lambda
exp.model <- lm(log(df.total) ~ t, data = d.total)
exp.fit <- coef(exp.model)
exp.growth.model <- function(t) {
  exp(exp.fit[1] + exp.fit[2] * t)
}

# Gompertz growth model: placeholder, nothing is fitted here.


# Observed series against the time index; the x-axis runs to t = 23 so the
# fitted curves extend beyond the last observation.
plot(
  t, df.total,
  main = "Fitting Mathematical Curves to the Cyber Crime data",
  xlab = "Year",
  ylab = "No. of Crimes(in '000)",
  xlim = c(0, 23),
  ylim = c(0, 100),
  xaxt = "n"
)
# Custom x-axis: label the time index with calendar years (t = 0 -> 2002).
axis(side = 1, at = seq(0, 23, 3), labels = seq(2002, 2025, 3))
# Overlay the fitted models: linear (blue), quadratic (green), exponential (red).
curve(linear.growth.model, add = TRUE, col = "blue")
curve(quad.growth.model, add = TRUE, col = "green")
curve(exp.growth.model, add = TRUE, col = "red")

# Coefficient table and fit statistics for the linear model.
summary(poly1)

## 
## Call:
## lm(formula = df.total ~ t, data = d.total)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.279  -8.915  -2.246   7.381  16.888 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11.3746     4.2387  -2.684   0.0152 *  
## t             2.4979     0.3814   6.549 3.73e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.836 on 18 degrees of freedom
## Multiple R-squared:  0.7044, Adjusted R-squared:  0.688 
## F-statistic: 42.89 on 1 and 18 DF,  p-value: 3.734e-06

# Coefficient table and fit statistics for the quadratic model.
summary(poly2)

## 
## Call:
## lm(formula = df.total ~ t + I(t^2), data = d.total)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4684 -2.4011  0.0816  2.5538  6.7140 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.33420    2.25325   2.367     0.03 *  
## t           -3.07169    0.54969  -5.588 3.27e-05 ***
## I(t^2)       0.29314    0.02793  10.496 7.58e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.7 on 17 degrees of freedom
## Multiple R-squared:  0.9605, Adjusted R-squared:  0.9558 
## F-statistic: 206.6 on 2 and 17 DF,  p-value: 1.182e-12

# Coefficient table and fit statistics for the log-linear (exponential) model;
# residuals and standard errors are on the log scale.
summary(exp.model)

## 
## Call:
## lm(formula = log(df.total) ~ t, data = d.total)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93241 -0.28659 -0.01763  0.21574  1.40458 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.61777    0.22490  -7.193 1.08e-06 ***
## t            0.29705    0.02024  14.678 1.85e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5219 on 18 degrees of freedom
## Multiple R-squared:  0.9229, Adjusted R-squared:  0.9186 
## F-statistic: 215.5 on 1 and 18 DF,  p-value: 1.851e-11

Zone Wise Comparison of Cyber Crimes

Defining the different zones

# Each zone is a sorted character vector of the state/UT names it contains.
n.zone <- sort(c(
  "Haryana", "Himachal Pradesh", "Jammu & Kashmir", "Punjab", "Rajasthan",
  "Delhi UT", "Chandigarh"
)) # northern zone
c.zone <- sort(c(
  "Uttar Pradesh", "Chhattisgarh", "Uttarakhand", "Madhya Pradesh"
)) # central zone
e.zone <- sort(c(
  "Bihar", "Jharkhand", "Odisha", "West Bengal"
)) # eastern zone
ne.zone <- sort(c(
  "Sikkim", "Tripura", "Arunachal Pradesh", "Assam", "Mizoram", "Manipur",
  "Meghalaya", "Nagaland"
)) # north-eastern zone
w.zone <- sort(c(
  "Goa", "Gujarat", "Maharashtra", "Daman & Diu", "D&N Haveli"
)) # western zone
s.zone <- sort(c(
  "Andhra Pradesh", "Karnataka", "Kerala", "Tamil Nadu", "Puducherry",
  "A & N Islands", "Lakshadweep", "Telangana"
)) # southern zone

Zone Wise data

# First column of `df` flattened to a vector; zone members are matched against
# it below (presumably the state/UT names -- confirm against the workbook).
states = unlist(df[1])
# Yearly share of a zone in the overall total.
#
# zone: character vector of names to aggregate; matched against `states`.
# Returns a two-column matrix with one row per year: column `year`
#   (2002..2021) and an unnamed column holding the zone's yearly column sum
#   divided by `df.total`. When `df.total` is expressed in thousands this is
#   the zone's share per 1000 of the total.
# Reads the globals `df`, `states` and `df.total`.
# Warns when some members of `zone` are absent from `states`; those members
# contribute nothing to the sums.
zonal.df <- function(zone) {
  rows <- which(states %in% zone)
  unmatched <- setdiff(zone, states)
  if (length(unmatched) > 0) {
    warning(
      "zone members not found in `states`: ",
      paste(unmatched, collapse = ", "),
      call. = FALSE
    )
  }
  # unlist() flattens the selected rows column by column, so the matrix must
  # have one row per *matched* row of `df` (not per requested name) to restore
  # the row-by-year layout; fixing ncol keeps the shape right with zero matches.
  counts <- matrix(
    as.numeric(unlist(df[rows, ][-1])),
    nrow = length(rows),
    ncol = ncol(df) - 1,
    byrow = FALSE
  )
  # The second cbind() argument is intentionally an unnamed expression so the
  # resulting column stays unnamed, as in the original.
  cbind(year = 2002:2021, colSums(counts) / c(df.total))
}

# Stack the six zonal matrices (20 yearly rows each) and tag every row with
# its zone. data.frame() prefixes the matrix columns with `n.crimes.`, which
# yields the columns `n.crimes.year` and `n.crimes.V2`.
z.data <- data.frame(
  n.crimes = round(
    rbind(
      north = zonal.df(n.zone),
      central = zonal.df(c.zone),
      east = zonal.df(e.zone),
      north.east = zonal.df(ne.zone),
      west = zonal.df(w.zone),
      south = zonal.df(s.zone)
    ),
    8
  ),
  zone = rep(
    c("north", "central", "east", "north-east", "west", "south"),
    each = 20
  )
)

Using ggplot to draw the line plots

# One line per zone: the zonal share (y) against the year (x), coloured by zone.
ggplot(
  data = z.data,
  mapping = aes(x = n.crimes.year, y = n.crimes.V2, colour = zone)
) +
  geom_line()

Stochastic Time Series for the South Zone

Setting up the time series for the south zone with the pre-defined function zonal.df

# Annual series (from 2002) of the south zone's share: second column of the
# matrix returned by zonal.df().
south.df <- ts(zonal.df(s.zone)[, 2], start = 2002)
plot(south.df)

# Successive differencing of the south-zone series, plotted after each pass.
# The original built each difference as lag(x, k = 1)[-1] - x[-1], which only
# works while dplyr::lag() masks stats::lag() and passes `k`, which is not a
# dplyr::lag() argument. diff() is the base-R equivalent and does not depend on
# which packages are attached. The leading minus keeps the original sign
# convention (previous value minus current value), so `l1` holds the same
# values and lengths as before.

# First-order differences (one value fewer than the series).
l1 <- -diff(as.numeric(south.df))
plot(l1, type = "l")

# Second-order differences: difference the first-order differences again.
l1 <- -diff(l1)
plot(l1, type = "l")

# Third-order differences.
l1 <- -diff(l1)
plot(l1, type = "l")

Checking Stationarity (Unit Root Test, Dickey Fuller Test, Augmented Dickey Fuller Test)

Some ACF plots

# Sample autocorrelation function of the south-zone series.
acf(south.df, type = "correlation", main = "ACF Plot")

# Sample autocovariance function (title abbreviation corrected to "ACVF").
acf(south.df, type = "covariance", main = "ACVF Plot")

# Sample partial autocorrelation function.
acf(south.df, type = "partial", main = "PACF Plot")

ARIMA (p = no. of autoregressive terms, d = order of differencing for stationarity, q = no. of moving average terms)

p = cut-off lag of the PACF; d = number of times the series must be differenced to become stationary; q = cut-off lag of the ACF.

Time Series Analysis for the Total Crimes Committed

AD

2023-04-18

Initial Steps

Packages Used

Loading the datasets

Setting up the time series data

Fitting Mathematical Curves

Zone Wise Comparison of Cyber Crimes

Defining the different zones

Zone Wise data

Using ggplot to draw the line plots

Stochastic Time Series for the South Zone

Setting up the time series for the south zone with the pre-defined function zonal.df

Checking Stationarity (Unit Root Test, Dickey Fuller Test, Augmented Dickey Fuller Test)

Some ACF plots

ARIMA (p = no. of autoregressive terms, d = order of differencing for stationarity, q = no. of moving average terms)