Required packages

Provide the packages required to reproduce the report. Make sure you fulfilled the minimum requirement #10.

# This is the R chunk for the required packages
#install.packages("tidyr")
#install.packages("dplyr")
#install.packages("editrules")
#install.packages("outliers")
#install.packages("forecast")

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(editrules)
## Loading required package: igraph
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
## 
## Attaching package: 'editrules'
## The following objects are masked from 'package:igraph':
## 
##     blocks, normalize
## The following object is masked from 'package:dplyr':
## 
##     contains
## The following objects are masked from 'package:tidyr':
## 
##     contains, separate
library(outliers)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

Executive Summary

Data

The data sets Crimes.csv and Suicide_rate.csv are both data sets about the statistics per State in India. These data sets can be found from Kaggle and are incredibly suited for data exploratory process. https://www.kaggle.com/beradabhishek/india-statewise-information?select=Crime.csv https://www.kaggle.com/beradabhishek/india-statewise-information?select=Sucide_Rate.csv

The Crimes dataset contains information and statistics about the crime rates, number of crimes per year for the years (2014, 2015, 2016) and the rankings based on these stats for each state in India. The dataset also contained figures about Cognizable crime which are set of serious crimes like terrorism, murder, extortion etc. All of this data is related to the 36 Indian States/Union Territories.

# This is the R chunk for the Data Section
crimes <- read.csv("Crime.csv", stringsAsFactors = FALSE)
suicide_rate <- read.csv("Suicide_Rate.csv", stringsAsFactors = FALSE)

#Dropping the Sno. columns from both dataframes
crimes$S.No <- NULL
suicide_rate$S.No <- NULL

#Changing the column names
colnames(crimes) <- c("States/UT","2014","2015","2016","Percentage_Share_of_State",
                      "Rank_based_on_incidence","Rate_of_cognizable_crime",
                      "Rank_based_on_crime_rate")
colnames(suicide_rate) <- c("States/UT","Suicide_rate_per_100000","Rank_based_on_suicide_rate")


#Checking the structures of both the frames
str(crimes)
## 'data.frame':    36 obs. of  8 variables:
##  $ States/UT                : chr  "Andhra Pradesh" "Arunachal Pradesh" "Assam" "Bihar" ...
##  $ 2014                     : int  114604 2843 94337 177595 58200 4466 131385 79947 14160 23848 ...
##  $ 2015                     : int  110693 2968 103616 176973 56692 3074 126935 84466 14007 23583 ...
##  $ 2016                     : int  106774 2534 102250 164163 55029 2692 147122 88527 13386 24501 ...
##  $ Percentage_Share_of_State: num  3.6 0.1 3.4 5.5 1.8 0.1 4.9 3 0.4 0.8 ...
##  $ Rank_based_on_incidence  : int  13 29 14 9 17 28 11 15 21 20 ...
##  $ Rate_of_cognizable_crime : num  206 192 314 157 212 ...
##  $ Rank_based_on_crime_rate : int  15 17 5 22 14 25 11 4 20 16 ...
str(suicide_rate)
## 'data.frame':    36 obs. of  3 variables:
##  $ States/UT                 : chr  "Andhra Pradesh" "Arunachal Pradesh" "Assam" "Bihar" ...
##  $ Suicide_rate_per_100000   : num  12.1 10.4 10 0.5 27.7 15.4 11.6 13 7.7 3 ...
##  $ Rank_based_on_suicide_rate: int  16 20 21 36 4 12 19 15 24 31 ...
#Merging the dataframes using the left_join funciton, 
#as it will help us keep all the cases for the crimes dataframe. Although both have identical keys
main_df <- crimes%>%left_join(suicide_rate, by = "States/UT")
head(main_df)

Understand

Once we have merged the Data sets together, it was essential to see what kind of data was imported as what data type. This step is critical to check the data consistency in any data set.

# This is the R chunk for the Understand Section
# Converting State columns in both dataframes into factors.
# We should also make the 3 Rank columns into factors, since they are unique values.

str(main_df)
## 'data.frame':    36 obs. of  10 variables:
##  $ States/UT                 : chr  "Andhra Pradesh" "Arunachal Pradesh" "Assam" "Bihar" ...
##  $ 2014                      : int  114604 2843 94337 177595 58200 4466 131385 79947 14160 23848 ...
##  $ 2015                      : int  110693 2968 103616 176973 56692 3074 126935 84466 14007 23583 ...
##  $ 2016                      : int  106774 2534 102250 164163 55029 2692 147122 88527 13386 24501 ...
##  $ Percentage_Share_of_State : num  3.6 0.1 3.4 5.5 1.8 0.1 4.9 3 0.4 0.8 ...
##  $ Rank_based_on_incidence   : int  13 29 14 9 17 28 11 15 21 20 ...
##  $ Rate_of_cognizable_crime  : num  206 192 314 157 212 ...
##  $ Rank_based_on_crime_rate  : int  15 17 5 22 14 25 11 4 20 16 ...
##  $ Suicide_rate_per_100000   : num  12.1 10.4 10 0.5 27.7 15.4 11.6 13 7.7 3 ...
##  $ Rank_based_on_suicide_rate: int  16 20 21 36 4 12 19 15 24 31 ...
main_df$`States/UT` <- as.factor(main_df$`States/UT`)
levels(main_df$`States/UT`)
##  [1] "Andaman and Nicobar Islands" "Andhra Pradesh"             
##  [3] "Arunachal Pradesh"           "Assam"                      
##  [5] "Bihar"                       "Chandigarh"                 
##  [7] "Chhattisgarh"                "Dadra and Nagar Haveli"     
##  [9] "Daman and Diu"               "Delhi"                      
## [11] "Goa"                         "Gujarat"                    
## [13] "Haryana"                     "Himachal Pradesh"           
## [15] "Jammu and Kashmir"           "Jharkhand"                  
## [17] "Karnataka"                   "Kerala"                     
## [19] "Lakshwadeep"                 "Madhya Pradesh"             
## [21] "Maharashtra"                 "Manipur"                    
## [23] "Meghalaya"                   "Mizoram"                    
## [25] "Nagaland"                    "Odisha"                     
## [27] "Puducherry"                  "Punjab"                     
## [29] "Rajasthan"                   "Sikkim"                     
## [31] "Tamil Nadu"                  "Telangana"                  
## [33] "Tripura"                     "Uttar Pradesh"              
## [35] "Uttarakhand"                 "West Bengal"
main_df$Rank_based_on_incidence <- as.character(main_df$Rank_based_on_incidence)
main_df$Rank_based_on_incidence <- factor(main_df$Rank_based_on_incidence,levels = c ("1" , "2", "3"  ,"4"  ,"5"  ,"6"  ,"7"  ,"8"  ,"9"  ,"10" ,"11" ,"12", "13" ,"14" ,"15" ,"16" ,"17" ,"18", "19", "20", "21", "22", "23" ,"24" ,"25" ,"26" ,"27" ,"28" ,"29" ,"30" ,"31" ,"32" ,"33" ,"34", "35" ,"36"), ordered = TRUE)
levels(main_df$Rank_based_on_incidence)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36"
main_df$Rank_based_on_crime_rate <- as.character(main_df$Rank_based_on_crime_rate)
main_df$Rank_based_on_crime_rate <- factor(main_df$Rank_based_on_crime_rate,levels = c ("1" , "2", "3"  ,"4"  ,"5"  ,"6"  ,"7"  ,"8"  ,"9"  ,"10" ,"11" ,"12", "13" ,"14" ,"15" ,"16" ,"17" ,"18", "19", "20", "21", "22", "23" ,"24" ,"25" ,"26" ,"27" ,"28" ,"29" ,"30" ,"31" ,"32" ,"33" ,"34", "35" ,"36"), ordered = TRUE)
levels(main_df$Rank_based_on_crime_rate)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36"
main_df$Rank_based_on_suicide_rate <- as.character(main_df$Rank_based_on_suicide_rate)
main_df$Rank_based_on_suicide_rate <- factor(main_df$Rank_based_on_suicide_rate,levels = c ("1" , "2", "3"  ,"4"  ,"5"  ,"6"  ,"7"  ,"8"  ,"9"  ,"10" ,"11" ,"12", "13" ,"14" ,"15" ,"16" ,"17" ,"18", "19", "20", "21", "22", "23" ,"24" ,"25" ,"26" ,"27" ,"28" ,"29" ,"30" ,"31" ,"32" ,"33" ,"34", "35" ,"36"), ordered = TRUE)
levels(main_df$Rank_based_on_suicide_rate)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36"

Tidy & Manipulate Data I

Explain why your data (or one of the data sets) doesn’t conform the tidy data principles (minimum requirement #5). Apply the required steps to reshape the data into a tidy format. In addition to the R codes and outputs, explain everything that you do in this step.

# This is the R chunk for the Tidy & Manipulate Data I 
#Tidying the Data Frame
tidy_df <- main_df %>%gather(`2014`, `2015`, `2016`, key = "Year_for_crime_data", value = "Crime_Cases")

#Converting the newly found Year column into factor as we did with the States/UT column
tidy_df$Year_for_crime_data <- as.factor(tidy_df$Year_for_crime_data) 

#To increase readability
tidy_df <- tidy_df %>% select(`States/UT`,Year_for_crime_data, Crime_Cases, everything())


#Arranging the dataframe by the states
tidy_df <- tidy_df%>%arrange(`States/UT`)
head(tidy_df)
str(tidy_df)
## 'data.frame':    108 obs. of  9 variables:
##  $ States/UT                 : Factor w/ 36 levels "Andaman and Nicobar Islands",..: 1 1 1 2 2 2 3 3 3 4 ...
##  $ Year_for_crime_data       : Factor w/ 3 levels "2014","2015",..: 1 2 3 1 2 3 1 2 3 1 ...
##  $ Crime_Cases               : int  746 862 802 114604 110693 106774 2843 2968 2534 94337 ...
##  $ Percentage_Share_of_State : num  0 0 0 3.6 3.6 3.6 0.1 0.1 0.1 3.4 ...
##  $ Rank_based_on_incidence   : Ord.factor w/ 36 levels "1"<"2"<"3"<"4"<..: 33 33 33 13 13 13 29 29 29 14 ...
##  $ Rate_of_cognizable_crime  : num  145 145 145 206 206 ...
##  $ Rank_based_on_crime_rate  : Ord.factor w/ 36 levels "1"<"2"<"3"<"4"<..: 23 23 23 15 15 15 17 17 17 5 ...
##  $ Suicide_rate_per_100000   : num  28.9 28.9 28.9 12.1 12.1 12.1 10.4 10.4 10.4 10 ...
##  $ Rank_based_on_suicide_rate: Ord.factor w/ 36 levels "1"<"2"<"3"<"4"<..: 3 3 3 16 16 16 20 20 20 21 ...

Tidy & Manipulate Data II

For the next part of Tidy and manipulation, I thought of mutating a new column into the existing dataframe. The ‘Rate_of_cognizable_crime’ column contained the number of Cognizable crimes per 100000 people in any particular State. I used this column and the ‘Crime_cases’ column to find out approximately how many Cognizable crimes could be registered in any particular State. There is no exact number of Cognizable crime count provided, so this was just a way to find out how may cases have been recorded approximately according to the rate of Cognizable crime.

For this I used the Rate_of_cognizable_crime column and divided it by 100000, and subsequentally multiplying this result by the number of crime cases.

# This is the R chunk for the Tidy & Manipulate Data II 
tidy_df %>% mutate(Approx_no_of_cognizable_crimes = (Rate_of_cognizable_crime/100000) * Crime_Cases)

Scan I

Scanning the data for missing values, special values and obvious errors (i.e. inconsistencies) included scanning the dataframe by the following ways.

# This is the R chunk for the Scan I
#Scanning the datframe for na values
#Since we can miss a few missing values here and there in a big dataframe, 
#it is better to get a count of missing values
colSums(is.na(tidy_df))
##                  States/UT        Year_for_crime_data 
##                          0                          0 
##                Crime_Cases  Percentage_Share_of_State 
##                          0                          0 
##    Rank_based_on_incidence   Rate_of_cognizable_crime 
##                          0                          0 
##   Rank_based_on_crime_rate    Suicide_rate_per_100000 
##                          0                          0 
## Rank_based_on_suicide_rate 
##                          0
#Once we have checked for missing values, it is time to check for infinite or nan values

#For nan values
colSums(sapply(tidy_df, is.nan))
##                  States/UT        Year_for_crime_data 
##                          0                          0 
##                Crime_Cases  Percentage_Share_of_State 
##                          0                          0 
##    Rank_based_on_incidence   Rate_of_cognizable_crime 
##                          0                          0 
##   Rank_based_on_crime_rate    Suicide_rate_per_100000 
##                          0                          0 
## Rank_based_on_suicide_rate 
##                          0
#For infinite values
colSums(sapply(tidy_df, is.infinite))
##                  States/UT        Year_for_crime_data 
##                          0                          0 
##                Crime_Cases  Percentage_Share_of_State 
##                          0                          0 
##    Rank_based_on_incidence   Rate_of_cognizable_crime 
##                          0                          0 
##   Rank_based_on_crime_rate    Suicide_rate_per_100000 
##                          0                          0 
## Rank_based_on_suicide_rate 
##                          0
#There are times when a particular column may have some obvious inconsistencies
#To check for that I will make use of the editrules library
(Rule1 <- editset(c("Percentage_Share_of_State >= 0", "Rate_of_cognizable_crime >= 0", 
                    "Suicide_rate_per_100000 >= 0", "Crime_Cases >= 0")))
## 
## Edit set:
## num1 : 0 <= Percentage_Share_of_State
## num2 : 0 <= Rate_of_cognizable_crime
## num3 : 0 <= Suicide_rate_per_100000
## num4 : 0 <= Crime_Cases
colSums(violatedEdits(Rule1, tidy_df))
## num1 num2 num3 num4 
##    0    0    0    0

Scan II

Scan the numeric data for outliers.

# This is the R chunk for the Scan II
#Searching for outliers in the Percentage_Share_of_State column
tidy_df$Percentage_Share_of_State%>%boxplot(main = "Box Plot for Percentage share of crime per state",
                                               ylab = "Percentage of crime", col = "grey")

z.scores_per_share <- tidy_df$Percentage_Share_of_State%>%scores(type = "z")
z.scores_per_share%>%summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.7819 -0.7583 -0.4641  0.0000  0.5361  2.9836
length (which( abs(z.scores_per_share) >3 ))
## [1] 0
#Searching for outliers in Rate_of_cognizable_crime column
tidy_df$Rate_of_cognizable_crime %>% boxplot(main = "Box Plot for Rate of cognizable crime",
                                             ylab = "Percentage of cognizable crime", col = "grey")

z.scores_cog_crimes <- tidy_df$Rate_of_cognizable_crime%>%scores(type = "z")
z.scores_cog_crimes%>%summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.9955 -0.5356 -0.1564  0.0000  0.1247  4.3606
length (which( abs(z.scores_cog_crimes) >3 ))
## [1] 3
#Searching for outliers in Suicide_rate_per_100000  column
tidy_df$Suicide_rate_per_100000 %>% boxplot(main = "Box Plot for Suicide Rate per 100000",
                                             ylab = "Suicide rate per 100000", col = "grey")

z.scores_suicide_rate <- tidy_df$Suicide_rate_per_100000%>%scores(type = "z")
z.scores_suicide_rate%>%summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.2419 -0.7237 -0.1620  0.0000  0.4482  2.8938
length (which( abs(z.scores_suicide_rate) >3 ))
## [1] 0

Transform

Transforming the column Suicide_rate_per_100000 as it had a few outliers as seen in the last section.

# This is the R chunk for the Transform Section

#Transforming the column Suicide_rate_per_100000 as it had a few outliers
hist(tidy_df$Suicide_rate_per_100000)

* I used the mathematical process of taking the log10 of each value of the column. As it turned out this made the values left skewed from being right skewed.

#Using log wwith base 10 transformation, we actually find out that the histogram actually flips to be left skewed
log_suicide_rate <- log10(tidy_df$Suicide_rate_per_100000)
hist(log_suicide_rate)

#Square transform makes little to no impact as data remains right skewed
square_suicide_rate <- tidy_df$Suicide_rate_per_100000^2
hist(square_suicide_rate)

#BoxCox transformation actually makes the distribution most centralized
boxcox_suicide_rate<- BoxCox(tidy_df$Suicide_rate_per_100000,lambda = "auto")
hist(boxcox_suicide_rate)