Initial Steps

Packages Used

library(readxl) #for loading data
library(fitdistrplus) #for fitting mathematical curves
## Loading required package: MASS
## Loading required package: survival
library(LogisticCurveFitting) #for fitting logistic curve
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()

Loading the datasets

# Read the workbook, keep row 37 aside as df.total, then drop rows 37-40
# from the state-level table.
# NOTE(review): row 37 is presumably the all-India total row and rows 38-40
# trailing summary/footer rows -- confirm against the workbook.
df <- read_excel("YEAR 2002-21-2.xlsx")
df.total <- df[37, ]
df <- df[-c(37, 38, 39, 40), ]

Setting up the time series data

# Turn the row held in df.total into an annual series starting in 2002:
# the first column is dropped, the remaining cells are coerced to numeric,
# and the counts are rescaled to thousands.
df.total <- ts(as.numeric(unlist(df.total[-1])), start = 2002) / 1000
# Line plot of the series (title typo "indt" corrected to "in").
plot(
  df.total,
  main = "Line Plot of Cyber-Crimes in India (2002-2021)",
  ylab = "No. of Crimes('000)",
  xlab = "Year"
)

Fitting Mathematical Curves

# time index 0..19 (t = 0 is labelled 2002 on the plot axis)
t <- 0:19
# pair the time index with the series so lm() can take both from one frame
d.total <- data.frame(t, df.total)

# linear growth model: y(t) = a + b*t
poly1 <- lm(df.total ~ t, data = d.total)
lin.fit <- coef(poly1)
linear.growth.model <- function(t) {
  lin.fit[1] + lin.fit[2] * t
}

# quadratic growth model: y(t) = a + b*t + c*t^2
poly2 <- lm(df.total ~ t + I(t^2), data = d.total)
quad.fit <- coef(poly2)
quad.growth.model <- function(t) {
  quad.fit[1] + quad.fit[2] * t + quad.fit[3] * t^2
}

# exponential growth model: y(t) = a*exp(r*t),
# fitted by least squares on the log scale: log(y) = log(a) + r*t
# est.lambda is the reciprocal of the series mean (noted by the author as the
# "mle of lambda"); it is not used by any of the fits in this section.
est.lambda <- 1 / mean(df.total)
exp.model <- lm(log(df.total) ~ t, data = d.total)
exp.fit <- coef(exp.model)
exp.growth.model <- function(t) {
  # back-transform the log-scale prediction
  exp(exp.fit[1] + exp.fit[2] * t)
}
# gompertz growth model: no fit is implemented in this section


# Observed series against the time index; the x-range runs to t = 23 so the
# fitted curves extend past the last observation. The default x-axis is
# suppressed and redrawn below with calendar years.
plot(
  x = t,
  y = df.total,
  main = "Fitting Mathematical Curves to the Cyber Crime data",
  xlab = "Year",
  ylab = "No. of Crimes(in '000)",
  xlim = c(0, 23),
  ylim = c(0, 100),
  xaxt = "n"
)
# ticks at t = 0, 3, ..., 21 labelled 2002, 2005, ..., 2023
axis(side = 1, at = seq(0, 23, 3), labels = seq(2002, 2025, 3))
# overlay the three fitted models: linear (blue), quadratic (green),
# exponential (red)
curve(linear.growth.model, add = TRUE, col = "blue")
curve(quad.growth.model, add = TRUE, col = "green")
curve(exp.growth.model, add = TRUE, col = "red")

# coefficient table and fit statistics for the linear growth model (df.total ~ t)
summary(poly1)
## 
## Call:
## lm(formula = df.total ~ t, data = d.total)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.279  -8.915  -2.246   7.381  16.888 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11.3746     4.2387  -2.684   0.0152 *  
## t             2.4979     0.3814   6.549 3.73e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.836 on 18 degrees of freedom
## Multiple R-squared:  0.7044, Adjusted R-squared:  0.688 
## F-statistic: 42.89 on 1 and 18 DF,  p-value: 3.734e-06
# coefficient table and fit statistics for the quadratic growth model (df.total ~ t + t^2)
summary(poly2)
## 
## Call:
## lm(formula = df.total ~ t + I(t^2), data = d.total)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4684 -2.4011  0.0816  2.5538  6.7140 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.33420    2.25325   2.367     0.03 *  
## t           -3.07169    0.54969  -5.588 3.27e-05 ***
## I(t^2)       0.29314    0.02793  10.496 7.58e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.7 on 17 degrees of freedom
## Multiple R-squared:  0.9605, Adjusted R-squared:  0.9558 
## F-statistic: 206.6 on 2 and 17 DF,  p-value: 1.182e-12
# coefficient table and fit statistics for the exponential model, fitted on the log scale (log(df.total) ~ t)
summary(exp.model)
## 
## Call:
## lm(formula = log(df.total) ~ t, data = d.total)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93241 -0.28659 -0.01763  0.21574  1.40458 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.61777    0.22490  -7.193 1.08e-06 ***
## t            0.29705    0.02024  14.678 1.85e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5219 on 18 degrees of freedom
## Multiple R-squared:  0.9229, Adjusted R-squared:  0.9186 
## F-statistic: 215.5 on 1 and 18 DF,  p-value: 1.851e-11

Zone Wise Comparison of Cyber Crimes

Defining the different zones

# Membership of each zone, stored as alphabetically sorted name vectors.

# northern zone
n.zone <- sort(c(
  "Haryana", "Himachal Pradesh", "Jammu & Kashmir", "Punjab", "Rajasthan",
  "Delhi UT", "Chandigarh"
))
# central zone
c.zone <- sort(c(
  "Uttar Pradesh", "Chhattisgarh", "Uttarakhand", "Madhya Pradesh"
))
# eastern zone
e.zone <- sort(c("Bihar", "Jharkhand", "Odisha", "West Bengal"))
# north-eastern zone
ne.zone <- sort(c(
  "Sikkim", "Tripura", "Arunachal Pradesh", "Assam", "Mizoram", "Manipur",
  "Meghalaya", "Nagaland"
))
# western zone
w.zone <- sort(c(
  "Goa", "Gujarat", "Maharashtra", "Daman & Diu", "D&N Haveli"
))
# southern zone
s.zone <- sort(c(
  "Andhra Pradesh", "Karnataka", "Kerala", "Tamil Nadu", "Puducherry",
  "A & N Islands", "Lakshadweep", "Telangana"
))

Zone Wise data

# first column of df flattened to a vector; zonal.df matches zone members against it
states = unlist(df[1])
# Yearly total of a zone expressed relative to df.total.
#
# zone : character vector of names to look up in `states`.
#
# Returns a two-column matrix: `year` (2002:2021) and an unnamed column holding
# (column sums of the zone's rows of df, first column excluded) / df.total.
# df.total is stored in thousands, so the ratio is the zone's count per 1000
# of the count underlying df.total.
#
# Reads the globals `df`, `states` and `df.total`.
# Names in `zone` that do not occur in `states` trigger a warning and are left
# out of the sum.
zonal.df <- function(zone) {
  rows <- which(states %in% zone)
  absent <- setdiff(zone, states)
  if (length(absent) > 0) {
    warning(
      "zone members not found in `states`: ",
      paste(absent, collapse = ", "),
      call. = FALSE
    )
  }
  # unlist() walks the selected rows column by column, so the matrix needs one
  # row per *matched* state (not one per requested name) to keep each year in
  # its own column; fixing ncol keeps the shape right when nothing matches.
  counts <- matrix(
    as.numeric(unlist(df[rows, ][-1])),
    nrow = length(rows),
    ncol = ncol(df) - 1L
  )
  cbind(year = 2002:2021, colSums(counts) / as.numeric(df.total))
}

# Long-format table for plotting: the six zonal tables from zonal.df are
# stacked in the order north, central, east, north-east, west, south, rounded
# to 8 decimals, and every block of 20 rows is tagged with its zone label.
# The resulting columns are n.crimes.year, n.crimes.V2 and zone.
z.data <- data.frame(
  n.crimes = round(
    do.call(
      rbind,
      lapply(list(n.zone, c.zone, e.zone, ne.zone, w.zone, s.zone), zonal.df)
    ),
    8
  ),
  zone = rep(
    c("north", "central", "east", "north-east", "west", "south"),
    each = 20
  )
)

Using ggplot to draw the line plots

# One line per zone: year on the x-axis, the zonal value on the y-axis,
# colour distinguishing the zones.
ggplot(
  data = z.data,
  mapping = aes(x = n.crimes.year, y = n.crimes.V2, colour = zone)
) +
  geom_line()

Stochastic Time Series for the South Zone

Setting up the time series for the South Zone using the previously defined function zonal.df

# Second column of zonal.df(s.zone) as an annual series starting in 2002,
# followed by a plot of that series.
south.df <- ts(data = zonal.df(s.zone)[, 2], start = 2002)
plot(x = south.df)

# Successive differencing of the South Zone series, plotted after each pass.
#
# Computed with diff() on plain numeric vectors so the result does not depend
# on which lag() is on the search path: stats::lag() only shifts the time
# attribute and would make "lag(x) - x" identically zero, while dplyr::lag()
# has no `k` argument.
# Sign convention: previous value minus current value, i.e. the negative of
# diff(). Each pass shortens the vector by one element.

# first difference
l1 <- -diff(as.numeric(south.df))
plot(l1, type = "l")

# second difference
l1 <- -diff(l1)
plot(l1, type = "l")

# third difference
l1 <- -diff(l1)
plot(l1, type = "l")

Checking Stationarity (Unit Root Test, Dickey Fuller Test, Augmented Dickey Fuller Test)

Some ACF plots

# Sample autocorrelation, autocovariance and partial autocorrelation of the
# South Zone series. The second title is corrected from "AVCF" to "ACVF"
# (autocovariance function).
acf(south.df, type = "correlation", main = "ACF Plot")

acf(south.df, type = "covariance", main = "ACVF Plot")

acf(south.df, type = "partial", main = "PACF Plot")

ARIMA (p = no. of autoregressive terms, d = order of differencing needed for stationarity, q = no. of moving-average terms)

p = cut-off point of the PACF; d = number of differences after which the time series is stationary; q = cut-off point of the ACF