Required packages

library(readxl)
library(dplyr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.4.4
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.4.4
library(outliers)

Executive Summary

Data is imported from web source Kaggle and preprocessed for analysis.Following steps done for preprocessing.
- Importing the datset from its location and checking its structure, sample values and null values
- Filtering the desired data subset rows using filter condition and then selecting only the desired columns
- Variables are then analyzed for their types and are converted to factor and date type where applicable, factor variables are labelled and ordered
- The dataset are tidy-uped and converted into same format so that they can be merged by joining them together logically using left join on district and year column, In this case the second dataset does not follow tidy pricipals so have used gather and spread functions to tidy.
- The final dataset follows tidy data principles as every variable has its own column, each district has its own row of data and each value has its own cell
- The final dataset has been used to create new mutated variables using existing variables as we want to analyze the ratios
- The final dataset has been scanned for missing values and it had missing values because values were missing in original dataset as well
- Missing values are imputed with mean values of all the districts
- The dataset is then checked for outliers by plotting histogram for all the individual variables
- The outliers which lie outside the lower limit of the outlier fence on the box plot are capped with the value of 5th percentile and those that lie above the upper limit are capped with 95th percentile value
- Histograms are again plotted for individual variables to check if capping is done successfully
- Plotting histogram of variable Population is observed to be right skewed to decrease the skewness of this variable it is log tranformed to fit a normal distribution

Data

Two dataset have been taken for analysis from https://www.kaggle.com/webaccess/all-census-data/data. The data captures census and GDP Data for India. We have considered only Data for Rajasthan state for the year 2011. We are concerned with Population, literacy and GDP related metrics for male, female and the total population. Dataset 1 has census data for different Indian states for the year 2011. It has columns for district code , district name,column for date with column name is year, state name and various variables depicting census, literacy and education related variables. Data for Rajasthan state with only desired metrics has been filtered. The second data set is specifc for Rajasthan state and captures GDP and Growth rate for different districts of Rajasthan.This data is not in tidy format. It has Year Column which captures date information, Discription column is a factor variable with GDP and growth rate. The variables do not have their own columns and district name is also not rowwise. This dataset is not in tidy format.

Data Source

#Dataset 1
setwd("K:/Masters of Analytics/Data Preprocessing/Assignment 3/all-census-data") # Changing working directory to desired location
india_districts_census_2011<-read.csv("india-districts-census-2011.csv",stringsAsFactors = FALSE) #Importing the csv file
str(india_districts_census_2011) #Datatset structure
## 'data.frame':    640 obs. of  22 variables:
##  $ District.code       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Year                : chr  "1-1-2011" "1-1-2011" "1-1-2011" "1-1-2011" ...
##  $ State.Name          : chr  "JAMMU AND KASHMIR" "JAMMU AND KASHMIR" "JAMMU AND KASHMIR" "JAMMU AND KASHMIR" ...
##  $ District.name       : chr  "Kupwara" "Badgam" "Leh(Ladakh)" "Kargil" ...
##  $ Population          : int  870354 753745 133487 140802 476835 642415 616435 1008039 392232 1236829 ...
##  $ Male                : int  474190 398041 78971 77785 251899 345351 326109 534733 207680 651124 ...
##  $ Female              : int  396164 355704 54516 63017 224936 297064 290326 473306 184552 585705 ...
##  $ Literate            : int  439654 335649 93770 86236 261724 364109 389204 545149 185979 748584 ...
##  $ Male_Literate       : int  282823 207741 62834 56301 163333 224469 228499 337170 117058 431746 ...
##  $ Female_Literate     : int  156831 127908 30936 29935 98391 139640 160705 207979 68921 316838 ...
##  $ Male_ST             : int  36913 12383 47543 62652 90274 121374 27693 20237 39398 5021 ...
##  $ Female_ST           : int  33439 11529 48314 59684 85827 111441 25614 17468 35976 3914 ...
##  $ Workers             : int  229064 214866 75079 51873 161393 290912 200431 304200 149317 407188 ...
##  $ Male_Workers        : int  190899 162578 53265 39839 117677 184752 161548 249581 101380 333151 ...
##  $ Female_Workers      : int  38165 52288 21814 12034 43716 106160 38883 54619 47937 74037 ...
##  $ Main_Workers        : int  123837 132003 57125 28941 73247 130377 142847 187959 75173 330547 ...
##  $ Marginal_Workers    : int  105227 82863 17954 22932 88146 160535 57584 116241 74144 76641 ...
##  $ Literate_Education  : int  405015 374972 75493 82938 288430 394977 426572 555050 199416 904049 ...
##  $ Illiterate_Education: int  289765 342646 32637 39854 164044 242306 181297 350119 163648 310225 ...
##  $ Total_Education     : int  694780 717618 108130 122792 452474 637283 607869 905169 363064 1214274 ...
##  $ Age_Group_0_29      : int  600759 503223 70703 87532 304979 404903 357864 636524 252378 693238 ...
##  $ Age_Group_30_49     : int  178435 160933 41515 35561 109818 153165 160123 239659 90465 351561 ...
head(india_districts_census_2011)
##   District.code     Year        State.Name District.name Population   Male
## 1             1 1-1-2011 JAMMU AND KASHMIR       Kupwara     870354 474190
## 2             2 1-1-2011 JAMMU AND KASHMIR        Badgam     753745 398041
## 3             3 1-1-2011 JAMMU AND KASHMIR   Leh(Ladakh)     133487  78971
## 4             4 1-1-2011 JAMMU AND KASHMIR        Kargil     140802  77785
## 5             5 1-1-2011 JAMMU AND KASHMIR         Punch     476835 251899
## 6             6 1-1-2011 JAMMU AND KASHMIR       Rajouri     642415 345351
##   Female Literate Male_Literate Female_Literate Male_ST Female_ST Workers
## 1 396164   439654        282823          156831   36913     33439  229064
## 2 355704   335649        207741          127908   12383     11529  214866
## 3  54516    93770         62834           30936   47543     48314   75079
## 4  63017    86236         56301           29935   62652     59684   51873
## 5 224936   261724        163333           98391   90274     85827  161393
## 6 297064   364109        224469          139640  121374    111441  290912
##   Male_Workers Female_Workers Main_Workers Marginal_Workers
## 1       190899          38165       123837           105227
## 2       162578          52288       132003            82863
## 3        53265          21814        57125            17954
## 4        39839          12034        28941            22932
## 5       117677          43716        73247            88146
## 6       184752         106160       130377           160535
##   Literate_Education Illiterate_Education Total_Education Age_Group_0_29
## 1             405015               289765          694780         600759
## 2             374972               342646          717618         503223
## 3              75493                32637          108130          70703
## 4              82938                39854          122792          87532
## 5             288430               164044          452474         304979
## 6             394977               242306          637283         404903
##   Age_Group_30_49
## 1          178435
## 2          160933
## 3           41515
## 4           35561
## 5          109818
## 6          153165
india_districts_census_2011_Rajasthan<-subset(india_districts_census_2011,india_districts_census_2011$State.Name=="RAJASTHAN") #Filtering Data for Rajasthan
India_selected<-india_districts_census_2011_Rajasthan %>% select (Year, State.Name, District.name, Population, Male, Female, Male_Literate,Female_Literate,Literate) #Selecting only the desired columns

sum(is.na(India_selected)) #Checking for NA values
## [1] 0
India_selected$Year=dmy(India_selected$Year)#Converting character to Date format
India_selected$District.name <-factor(c("Ajmer","Alwar","Banswara","Baran","Barmer","Bharatpur","Bhilwara","Bikaner","Bundi","Chittaurgarh","Churu","Dausa","Dhaulpur","Dungarpur","Ganganagar","Hanumangarh","Jaipur","Jaisalmer","Jalor","Jhalawar","Jhunjhunun","Jodhpur","Karauli","Kota","Nagaur","Pali","Pratapgarh","Rajsamand","Sawai Madhopur","Sikar","Sirohi","Tonk","Udaipur"),labels = c("Ajmer","Alwar","Banswara","Baran","Barmer","Bharatpur","Bhilwara","Bikaner","Bundi","Chittorgarh","Churu","Dausa","Dholpur","Dungarpur","Ganga.Nagar","Hanumangarh","Jaipur","Jaisalmer","Jalore","Jhalawar","Jhunjhunu","Jodhpur","Karauli","Kota","Nagaur","Pali","Pratapgarh","Rajsamand","Sawai.Madhopur","Sikar","Sirohi","Tonk","Udaipur"),ordered=TRUE) #Renaming Factor variables for consistency with second dataset 
str(India_selected)
## 'data.frame':    33 obs. of  9 variables:
##  $ Year           : Date, format: "2011-01-01" "2011-01-01" ...
##  $ State.Name     : chr  "RAJASTHAN" "RAJASTHAN" "RAJASTHAN" "RAJASTHAN" ...
##  $ District.name  : Ord.factor w/ 33 levels "Ajmer"<"Alwar"<..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Population     : int  1969168 1774692 2363937 2039547 2137045 3674179 2548462 1206516 1458248 1335551 ...
##  $ Male           : int  1043340 931184 1240801 1051446 1095896 1939026 1355726 653647 783639 704031 ...
##  $ Female         : int  925828 843508 1123136 988101 1041149 1735153 1192736 552869 674609 631520 ...
##  $ Male_Literate  : int  710908 624281 782399 696729 815834 1359829 943910 435721 531904 486184 ...
##  $ Female_Literate: int  482435 409855 496402 452540 554526 822647 536959 247377 273945 255564 ...
##  $ Literate       : int  1193343 1034136 1278801 1149269 1370360 2182476 1480869 683098 805849 741748 ...
#Dataset 2
gdp_Rajasthan2 <- read.csv("K:/Masters of Analytics/Data Preprocessing/Assignment 3/all-census-data/gdp_Rajasthan2.csv", stringsAsFactors = FALSE)#importing second datatset 
str(gdp_Rajasthan2) #Structure of dataset
## 'data.frame':    10 obs. of  35 variables:
##  $ Year          : chr  "01-01-11" "01-01-11" "01-01-07" "01-01-07" ...
##  $ Description   : chr  "GDP (in Rs. Cr.)" "Growth Rate % (YoY)" "GDP (in Rs. Cr.)" "Growth Rate % (YoY)" ...
##  $ Ajmer         : num  6592.1 20 7561.9 14.7 8010 ...
##  $ Alwar         : num  7.74e+03 6.97e-03 9.05e+03 1.68e+01 9.36e+03 ...
##  $ Banswara      : num  2313.26 -4.48 2365.84 2.27 2666.99 ...
##  $ Baran         : num  2952.26 8.15 2842.88 -3.7 3121.79 ...
##  $ Barmer        : num  3336.64 3.84 3805.29 14.05 4039.9 ...
##  $ Bharatpur     : num  3967.7 8.9 4350.16 9.64 4382.61 ...
##  $ Bhilwara      : num  5692.89 -8.99 7552.83 32.67 7625.84 ...
##  $ Bikaner       : num  4138.5 -0.52 4467.51 7.95 4843.96 ...
##  $ Bundi         : num  2581.9 10.93 2754.59 6.69 2820.98 ...
##  $ Chittorgarh   : num  4298.68 7.07 4834.47 12.46 4718.41 ...
##  $ Churu         : num  2793.28 1.22 3078.22 10.2 3432.66 ...
##  $ Dausa         : num  2143.22 5.25 2306.14 7.6 2443.27 ...
##  $ Dholpur       : num  1467.29 2.77 1569.34 6.95 1621.13 ...
##  $ Dungarpur     : num  1819.24 -1.05 2049.5 12.66 2265.93 ...
##  $ Ganga.Nagar   : num  5090.11 9.72 5627.78 10.56 5489.89 ...
##  $ Hanumangarh   : num  4062.44 13.56 4340.83 6.85 4667.09 ...
##  $ Jaipur        : num  19661.4 12.5 22409.2 14 23747.2 ...
##  $ Jaisalmer     : num  1634.2 45.6 1367.2 -16.3 1441.1 ...
##  $ Jalore        : num  2579.13 4.88 2964.91 14.96 3125.44 ...
##  $ Jhalawar      : num  2430.15 -4.25 2791.86 14.88 2718.78 ...
##  $ Jhunjhunu     : num  3568.14 4.88 3888.43 8.98 3951.56 ...
##  $ Jodhpur       : num  7451.3 12.2 8252.9 10.8 9029 ...
##  $ Karauli       : num  2352.03 3.26 2550.77 8.45 2726.71 ...
##  $ Kota          : num  4857.33 6.42 5205.05 7.16 5751.7 ...
##  $ Nagaur        : num  5207.15 7.67 5381.59 3.35 5743.51 ...
##  $ Pali          : num  4660 15.4 5528.6 18.6 5750.4 ...
##  $ Pratapgarh    : num  NA NA NA NA NA ...
##  $ Rajsamand     : num  2508.7 13.1 2918.6 16.3 3094.8 ...
##  $ Sawai.Madhopur: num  2355.29 7.5 2492.76 5.84 2574.8 ...
##  $ Sikar         : num  4465.79 2.35 5116.44 14.57 5259.51 ...
##  $ Sirohi        : num  2419.54 7.08 2613.48 8.02 2755.18 ...
##  $ Tonk          : num  2517.7 2.854 2525.46 0.308 2825.51 ...
##  $ Udaipur       : num  6625.06 -0.0382 7629.19 15.1565 8013.37 ...
head(gdp_Rajasthan2) #Sample Values
##       Year         Description       Ajmer        Alwar    Banswara
## 1 01-01-11    GDP (in Rs. Cr.) 6592.130000 7.742570e+03 2313.260000
## 2 01-01-11 Growth Rate % (YoY)   19.990644 6.974915e-03   -4.484946
## 3 01-01-07    GDP (in Rs. Cr.) 7561.850000 9.045130e+03 2365.840000
## 4 01-01-07 Growth Rate % (YoY)   14.710268 1.682335e+01    2.272983
## 5 01-01-08    GDP (in Rs. Cr.) 8010.040000 9.357610e+03 2666.990000
## 6 01-01-08 Growth Rate % (YoY)    5.926989 3.454677e+00   12.729094
##         Baran      Barmer    Bharatpur     Bhilwara      Bikaner
## 1 2952.260000 3336.640000 3967.7000000 5692.8900000 4138.5000000
## 2    8.153673    3.841653    8.9028199   -8.9891898   -0.5204139
## 3 2842.880000 3805.290000 4350.1600000 7552.8300000 4467.5100000
## 4   -3.704958   14.045567    9.6393377   32.6712794    7.9499819
## 5 3121.790000 4039.900000 4382.6100000 7625.8400000 4843.9600000
## 6    9.810826    6.165365    0.7459496    0.9666575    8.4263941
##         Bundi Chittorgarh       Churu       Dausa     Dholpur   Dungarpur
## 1 2581.900000 4298.680000 2793.280000 2143.220000 1467.290000 1819.240000
## 2   10.931612    7.066304    1.222668    5.248116    2.769392   -1.047055
## 3 2754.590000 4834.470000 3078.220000 2306.140000 1569.340000 2049.500000
## 4    6.688485   12.464059   10.200911    7.601646    6.954999   12.656934
## 5 2820.980000 4718.410000 3432.660000 2443.270000 1621.130000 2265.930000
## 6    2.410159   -2.400677   11.514447    5.946300    3.300113   10.560137
##   Ganga.Nagar Hanumangarh       Jaipur   Jaisalmer      Jalore   Jhalawar
## 1 5090.110000 4062.440000 19661.410000 1634.200000 2579.130000 2430.15000
## 2    9.724531   13.562895    12.477789   45.597420    4.875936   -4.25469
## 3 5627.780000 4340.830000 22409.170000 1367.180000 2964.910000 2791.86000
## 4   10.563033    6.852778    13.975396  -16.339493   14.957757   14.88427
## 5 5489.890000 4667.090000 23747.200000 1441.140000 3125.440000 2718.78000
## 6   -2.450167    7.516074     5.970904    5.409675    5.414330   -2.61761
##     Jhunjhunu     Jodhpur     Karauli        Kota      Nagaur        Pali
## 1 3568.140000 7451.280000 2352.030000 4857.330000 5207.150000 4659.980000
## 2    4.883290   12.216720    3.262020    6.423773    7.667162   15.385453
## 3 3888.430000 8252.850000 2550.770000 5205.050000 5381.590000 5528.590000
## 4    8.976385   10.757481    8.449722    7.158665    3.350009   18.639780
## 5 3951.560000 9028.950000 2726.710000 5751.700000 5743.510000 5750.380000
## 6    1.623534    9.404024    6.897525   10.502301    6.725150    4.011692
##   Pratapgarh   Rajsamand Sawai.Madhopur       Sikar      Sirohi
## 1         NA 2508.670000    2355.290000 4465.790000 2419.540000
## 2         NA   13.140247       7.502328    2.347034    7.083457
## 3         NA 2918.570000    2492.760000 5116.440000 2613.480000
## 4         NA   16.339335       5.836649   14.569651    8.015573
## 5         NA 3094.770000    2574.800000 5259.510000 2755.180000
## 6         NA    6.037203       3.291131    2.796280    5.421890
##           Tonk       Udaipur
## 1 2517.7000000 6625.06000000
## 2    2.8535245   -0.03817375
## 3 2525.4600000 7629.19000000
## 4    0.3082178   15.15654198
## 5 2825.5100000 8013.37000000
## 6   11.8810039    5.03565909
#Each variable does not have its own column so tidy up the data so that the two datasets can be merged to get a tidy single dataset 
gdp_Rajasthan2$Year=mdy(gdp_Rajasthan2$Year) #Converting character to Date format
gdp_Rajasthan2_1<-gather(gdp_Rajasthan2,key = "District", value = "Value_data",3:35) #As column Names are district values we convert from wide to long format
gdp_Rajasthan2_2<-spread(gdp_Rajasthan2_1,key=Description,value=Value_data)  #Multiple variables are stored in rows , spreading so that each variable has its own column
gdp_Rajasthan2_3<-subset(gdp_Rajasthan2_2,gdp_Rajasthan2_2$Year=="2011-01-01") #Filter 2011 Data
gdp_Rajasthan2_3$District<-factor(c("Ajmer","Alwar","Banswara","Baran","Barmer","Bharatpur","Bhilwara","Bikaner","Bundi","Chittorgarh","Churu","Dausa","Dholpur","Dungarpur","Ganga.Nagar","Hanumangarh","Jaipur","Jaisalmer","Jalore","Jhalawar","Jhunjhunu","Jodhpur","Karauli","Kota","Nagaur","Pali","Pratapgarh","Rajsamand","Sawai.Madhopur","Sikar","Sirohi","Tonk","Udaipur"),labels = c("Ajmer","Alwar","Banswara","Baran","Barmer","Bharatpur","Bhilwara","Bikaner","Bundi","Chittorgarh","Churu","Dausa","Dholpur","Dungarpur","Ganga.Nagar","Hanumangarh","Jaipur","Jaisalmer","Jalore","Jhalawar","Jhunjhunu","Jodhpur","Karauli","Kota","Nagaur","Pali","Pratapgarh","Rajsamand","Sawai.Madhopur","Sikar","Sirohi","Tonk","Udaipur"),ordered=TRUE) #Converting to  Factor variables and oredering the factor variable 
str(gdp_Rajasthan2_3)
## 'data.frame':    33 obs. of  4 variables:
##  $ Year               : Date, format: "2011-01-01" "2011-01-01" ...
##  $ District           : Ord.factor w/ 33 levels "Ajmer"<"Alwar"<..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ GDP (in Rs. Cr.)   : num  6592 7743 2313 2952 3337 ...
##  $ Growth Rate % (YoY): num  19.99064 0.00697 -4.48495 8.15367 3.84165 ...
head(gdp_Rajasthan2_3)
##           Year  District GDP (in Rs. Cr.) Growth Rate % (YoY)
## 133 2011-01-01     Ajmer          6592.13        19.990644120
## 134 2011-01-01     Alwar          7742.57         0.006974915
## 135 2011-01-01  Banswara          2313.26        -4.484945579
## 136 2011-01-01     Baran          2952.26         8.153673128
## 137 2011-01-01    Barmer          3336.64         3.841653181
## 138 2011-01-01 Bharatpur          3967.70         8.902819940
#Checking for NAs
sum(is.na(gdp_Rajasthan2_3))
## [1] 2
#Merging the two datasets 
Final_dataset<-India_selected %>% left_join(gdp_Rajasthan2_3, c("District.name" = "District","Year"="Year")) #Left Join the two dataset to merge them meaningfully on common columns
#Renaming Column Names
colnames(Final_dataset)[10] <- "GDP_Cr"
colnames(Final_dataset)[11] <- "Growth_Rate_%_(YoY)"

head(Final_dataset)
##         Year State.Name District.name Population    Male  Female
## 1 2011-01-01  RAJASTHAN         Ajmer    1969168 1043340  925828
## 2 2011-01-01  RAJASTHAN         Alwar    1774692  931184  843508
## 3 2011-01-01  RAJASTHAN      Banswara    2363937 1240801 1123136
## 4 2011-01-01  RAJASTHAN         Baran    2039547 1051446  988101
## 5 2011-01-01  RAJASTHAN        Barmer    2137045 1095896 1041149
## 6 2011-01-01  RAJASTHAN     Bharatpur    3674179 1939026 1735153
##   Male_Literate Female_Literate Literate  GDP_Cr Growth_Rate_%_(YoY)
## 1        710908          482435  1193343 6592.13        19.990644120
## 2        624281          409855  1034136 7742.57         0.006974915
## 3        782399          496402  1278801 2313.26        -4.484945579
## 4        696729          452540  1149269 2952.26         8.153673128
## 5        815834          554526  1370360 3336.64         3.841653181
## 6       1359829          822647  2182476 3967.70         8.902819940

Understand

Final Dataset haas 11 variables with different types like character, integer, Date, factor, numeric and is a merged dataset in tidy format. Different datatype transformation has been applied to reach this stage.

str(Final_dataset)
## 'data.frame':    33 obs. of  11 variables:
##  $ Year               : Date, format: "2011-01-01" "2011-01-01" ...
##  $ State.Name         : chr  "RAJASTHAN" "RAJASTHAN" "RAJASTHAN" "RAJASTHAN" ...
##  $ District.name      : Ord.factor w/ 33 levels "Ajmer"<"Alwar"<..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Population         : int  1969168 1774692 2363937 2039547 2137045 3674179 2548462 1206516 1458248 1335551 ...
##  $ Male               : int  1043340 931184 1240801 1051446 1095896 1939026 1355726 653647 783639 704031 ...
##  $ Female             : int  925828 843508 1123136 988101 1041149 1735153 1192736 552869 674609 631520 ...
##  $ Male_Literate      : int  710908 624281 782399 696729 815834 1359829 943910 435721 531904 486184 ...
##  $ Female_Literate    : int  482435 409855 496402 452540 554526 822647 536959 247377 273945 255564 ...
##  $ Literate           : int  1193343 1034136 1278801 1149269 1370360 2182476 1480869 683098 805849 741748 ...
##  $ GDP_Cr             : num  6592 7743 2313 2952 3337 ...
##  $ Growth_Rate_%_(YoY): num  19.99064 0.00697 -4.48495 8.15367 3.84165 ...
attributes(Final_dataset)
## $class
## [1] "data.frame"
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30 31 32 33
## 
## $names
##  [1] "Year"                "State.Name"          "District.name"      
##  [4] "Population"          "Male"                "Female"             
##  [7] "Male_Literate"       "Female_Literate"     "Literate"           
## [10] "GDP_Cr"              "Growth_Rate_%_(YoY)"

Tidy & Manipulate Data I

#As two datasets were merged Dataset 2 was not in Tidy Format as Each variable did not have its own column so the data was done tidy before merging it
gdp_Rajasthan2_1<-gather(gdp_Rajasthan2,key = "District", value = "Value_data",3:35) #Gathers all Districts which are present as column headers in one column
gdp_Rajasthan2_2<-spread(gdp_Rajasthan2_1,key=Description,value=Value_data)  #Convert the variables into individual columns as per tidy data principal 
gdp_Rajasthan2_3<-subset(gdp_Rajasthan2_2,gdp_Rajasthan2_2$Year=="2011-01-01") #Selecting only 2011 Data
head(gdp_Rajasthan2_3)
##           Year  District GDP (in Rs. Cr.) Growth Rate % (YoY)
## 133 2011-01-01     Ajmer          6592.13        19.990644120
## 134 2011-01-01     Alwar          7742.57         0.006974915
## 135 2011-01-01  Banswara          2313.26        -4.484945579
## 136 2011-01-01     Baran          2952.26         8.153673128
## 137 2011-01-01    Barmer          3336.64         3.841653181
## 138 2011-01-01 Bharatpur          3967.70         8.902819940
#We use the above data to left join with dataset 1 to get the Final Dataset
head(Final_dataset)
##         Year State.Name District.name Population    Male  Female
## 1 2011-01-01  RAJASTHAN         Ajmer    1969168 1043340  925828
## 2 2011-01-01  RAJASTHAN         Alwar    1774692  931184  843508
## 3 2011-01-01  RAJASTHAN      Banswara    2363937 1240801 1123136
## 4 2011-01-01  RAJASTHAN         Baran    2039547 1051446  988101
## 5 2011-01-01  RAJASTHAN        Barmer    2137045 1095896 1041149
## 6 2011-01-01  RAJASTHAN     Bharatpur    3674179 1939026 1735153
##   Male_Literate Female_Literate Literate  GDP_Cr Growth_Rate_%_(YoY)
## 1        710908          482435  1193343 6592.13        19.990644120
## 2        624281          409855  1034136 7742.57         0.006974915
## 3        782399          496402  1278801 2313.26        -4.484945579
## 4        696729          452540  1149269 2952.26         8.153673128
## 5        815834          554526  1370360 3336.64         3.841653181
## 6       1359829          822647  2182476 3967.70         8.902819940
#Final Dataset is in Tidy format because Each variable has its own column, Each observation for district has its own row and Each value has its own cell.

Tidy & Manipulate Data II

#Create new Mutated variables in original dataset, We are dividing number of Males and Females to Total populationindividually to get ratios for analysis
Final_dataset<-mutate(Final_dataset,
       Male_Ratio = Male/Population,
       Female_Ratio = Female / Population)
head(Final_dataset)
##         Year State.Name District.name Population    Male  Female
## 1 2011-01-01  RAJASTHAN         Ajmer    1969168 1043340  925828
## 2 2011-01-01  RAJASTHAN         Alwar    1774692  931184  843508
## 3 2011-01-01  RAJASTHAN      Banswara    2363937 1240801 1123136
## 4 2011-01-01  RAJASTHAN         Baran    2039547 1051446  988101
## 5 2011-01-01  RAJASTHAN        Barmer    2137045 1095896 1041149
## 6 2011-01-01  RAJASTHAN     Bharatpur    3674179 1939026 1735153
##   Male_Literate Female_Literate Literate  GDP_Cr Growth_Rate_%_(YoY)
## 1        710908          482435  1193343 6592.13        19.990644120
## 2        624281          409855  1034136 7742.57         0.006974915
## 3        782399          496402  1278801 2313.26        -4.484945579
## 4        696729          452540  1149269 2952.26         8.153673128
## 5        815834          554526  1370360 3336.64         3.841653181
## 6       1359829          822647  2182476 3967.70         8.902819940
##   Male_Ratio Female_Ratio
## 1  0.5298380    0.4701620
## 2  0.5247018    0.4752982
## 3  0.5248875    0.4751125
## 4  0.5155292    0.4844708
## 5  0.5128090    0.4871910
## 6  0.5277440    0.4722560
#New Variables to calculate Male and Female ratio to the total Population was done

Scan I

#Checking for NAs
sum(is.na(Final_dataset))
## [1] 2
# mean imputation (for numerical variables)
Final_dataset$GDP_Cr <- impute(Final_dataset$GDP_Cr, fun = mean) #Missing value NA value is imputed with mean value of all the districts
Final_dataset$GDP_Cr<-as.numeric(Final_dataset$GDP_Cr) #Ensuring its type is numeric after imputing
#Replacing NA value with mean value of all the districts
Final_dataset$`Growth_Rate_%_(YoY)` <- impute(Final_dataset$`Growth_Rate_%_(YoY)`, fun = mean) #Missing value NA value is imputed with mean value of all the districts
Final_dataset$`Growth_Rate_%_(YoY)`<-as.numeric(Final_dataset$`Growth_Rate_%_(YoY)`) #Ensuring its type is numeric after imputing
sum(is.na(Final_dataset)) # Confirm if any missing values are present
## [1] 0

Scan II

#Checking for Outliers by plotting histogram of variables and their locations. The outliers which lie outside the lower limit of the outlier fence on the box plot are capped with the value of 5th percentile and those that lie above the upper limit are capped with 95th percentile value.

cap <- function(x){
  quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
  x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
  x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
  x
} #Capping function created to get 5th and 95th Percentile values as per the case

Final_dataset$Population %>%  boxplot(main="Population (Checking for outlier)")

z.scores <- Final_dataset$Population %>%  scores(type = "z")
z.scores %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.2515 -0.6124 -0.2210  0.0000  0.4191  4.0454
which( abs(z.scores) >3 ) #Outlier Location
## [1] 12
#Capping the outlier for Population Variable

Final_dataset$Population <- Final_dataset$Population %>% cap()
Final_dataset$Population %>%  boxplot(main="Population after Outlier capping")

#Male 
Final_dataset$Male %>%  boxplot(main="Male (Checking for outlier)")

z.scores <- Final_dataset$Male %>%  scores(type = "z")
z.scores %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.2163 -0.6344 -0.2391  0.0000  0.4195  4.0643
which( abs(z.scores) >3 ) #Outlier Location
## [1] 12
#Capping the outlier
Final_dataset$Male <- Final_dataset$Male %>% cap()
Final_dataset$Male %>%  boxplot(main="Male after Outlier Capping")

#Female
Final_dataset$Female %>%  boxplot(main="Female (Checking for Oulier)")

z.scores <- Final_dataset$Female %>%  scores(type = "z")
z.scores %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.2886 -0.6061 -0.2009  0.0000  0.3592  4.0198
which( abs(z.scores) >3 ) #Outlier Location
## [1] 12
#Capping the outlier
Final_dataset$Female <- Final_dataset$Female %>% cap()
Final_dataset$Female %>%  boxplot(main="Female (After Outlier Removal)")

#Male Literate
Final_dataset$Male_Literate %>%  boxplot(main="Male Literate (Checking for Oulier)")

z.scores <- Final_dataset$Male_Literate %>%  scores(type = "z")
z.scores %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.1747 -0.5811 -0.2166  0.0000  0.2269  4.2528
which( abs(z.scores) >3 ) #Outlier Location
## [1] 12
#Capping the outlier
Final_dataset$Male_Literate <- Final_dataset$Male_Literate %>% cap()
Final_dataset$Male_Literate %>%  boxplot(main="Male Literate (After Outlier Removal)")

#Female Literate
Final_dataset$Female_Literate %>%  boxplot(main="Female_Literate (Checking for Oulier)")

z.scores <- Final_dataset$Female_Literate %>%  scores(type = "z")
z.scores %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.1624 -0.5960 -0.1257  0.0000  0.3210  4.4096
which( abs(z.scores) >3 ) #Outlier Location
## [1] 12
#Capping the outlier
Final_dataset$Female_Literate <- Final_dataset$Female_Literate %>% cap()
Final_dataset$Female_Literate %>%  boxplot(main="Female_Literate (After Outlier Removal)")

#Literate
Final_dataset$Literate %>%  boxplot(main="Literate (Checking for Oulier)")

z.scores <- Final_dataset$Literate %>%  scores(type = "z")
z.scores %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.1717 -0.5782 -0.1731  0.0000  0.2898  4.3237
which( abs(z.scores) >3 ) #Outlier Location
## [1] 12
#Capping the outlier
Final_dataset$Literate <- Final_dataset$Literate %>% cap()
Final_dataset$Literate %>%  boxplot(main="Literate (After Outlier Removal)")

#GDP in Crore Rupees 
Final_dataset$GDP_Cr %>% boxplot(main="GDP (Checking for Oulier)")

z.scores <- Final_dataset$GDP_Cr %>%  scores(type = "z")
z.scores %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.8631 -0.5654 -0.2136  0.0000  0.1850  4.7622
which( abs(z.scores) >3 ) #Outlier Location
## [1] 17
#Capping the outlier
Final_dataset$GDP_Cr <- Final_dataset$GDP_Cr %>% cap()
Final_dataset$GDP_Cr %>%  boxplot(main="GDP (After Outlier Removal)")

#Growth Rate
Final_dataset$`Growth_Rate_%_(YoY)` %>%  boxplot(main="Growth Rate (Checking for Oulier)")

z.scores <- Final_dataset$`Growth_Rate_%_(YoY)`%>%  scores(type = "z")
z.scores %>% summary()
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.70247 -0.48063 -0.04123  0.00000  0.31453  4.18097
which( abs(z.scores) >3 ) #Outlier Location
## [1] 18
#Capping the outlier
Final_dataset$`Growth_Rate_%_(YoY)` <- Final_dataset$`Growth_Rate_%_(YoY)` %>% cap()
Final_dataset$`Growth_Rate_%_(YoY)` %>%  boxplot(main="Growth Rate (After Outlier Removal)")

#Male Ratio Literate
Final_dataset$Male_Ratio %>%  boxplot(main="Male Ratio (Checking for Oulier)")

z.scores <- Final_dataset$Male_Ratio %>%  scores(type = "z")
z.scores %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.5387 -0.5675 -0.2590  0.0000  0.6023  2.1555
which( abs(z.scores) >3 ) #Outlier Location
## integer(0)
#No Outliers present

#Female Ratio
Final_dataset$Female_Ratio %>%  boxplot(main="Female_Ratio (Checking for Oulier)")

z.scores <- Final_dataset$Female_Ratio %>%  scores(type = "z")
z.scores %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.1555 -0.6023  0.2590  0.0000  0.5675  1.5387
which( abs(z.scores) >3 ) #Outlier Location
## integer(0)
#No Outliers present

Transform

#Data Transformation changing the right skewness of data to normal distribution
hist(Final_dataset$Population, main="Population Histogram") # PLotting Histogram to check skewness of variable

log10_Population <-log10 (Final_dataset$Population) # As the variable it right skewed it is applied log10 transformation to change it to normal distribution
hist(log10_Population,main="Log10 Transformed Population") #Checking for normal curve