Categorical Data Visualisation project

Source file ⇒ categoricalvisualisationproject.rmd

Exploratory data analysis

REVEAL THYSELF……………

library(foreign)

# Load the survey data from the SPSS export. read.spss() hands back a list
# here, so the result is converted to a data frame straight away.
# (Alternative CSV import, which needs no such conversion:
#  allvariables.1 <- read.csv("all variables.1.csv", header = TRUE))
allvariables.1 <- as.data.frame(read.spss("all variables.111.sav"))

# Confirm the conversion produced a data frame.
class(allvariables.1)

## [1] "data.frame"

class(allvariables.1)

## [1] "data.frame"

# Class of each column
sapply(allvariables.1,class)

##        STATEID         DISTID           VNID          ROOMS         INCOME 
##       "factor"      "numeric"      "numeric"      "numeric"      "numeric" 
##         EDU.HH          WATER         RICE.P    ELEC.ACCESS       ELEC.HRS 
##       "factor"       "factor"      "numeric"       "factor"      "numeric" 
##            NFE         ASSETS        NADULTM        NADULTF        NCHILDM 
##       "factor"      "numeric"      "numeric"      "numeric"      "numeric" 
##        NCHILDF         NTEENM         NTEENF      URBAN2011      NFE.TYPES 
##      "numeric"      "numeric"      "numeric"       "factor"       "factor" 
##         NADULT         NCHILD          NTEEN     ELEC.HRS.3     ELEC.HRS.4 
##      "numeric"      "numeric"      "numeric"       "factor"       "factor" 
##       filter_. ELEC.HRS.3.NEW 
##       "factor"       "factor"

str(allvariables.1)

## 'data.frame':    39954 obs. of  27 variables:
##  $ STATEID       : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ DISTID        : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ VNID          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ROOMS         : num  12 10 3 4 10 5 5 2 7 2 ...
##  $ INCOME        : num  176100 1039150 182340 90760 212600 ...
##  $ EDU.HH        : Factor w/ 18 levels "none 0","1st class 1",..: 1 1 11 1 1 1 1 1 5 1 ...
##  $ WATER         : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 1 2 2 ...
##  $ RICE.P        : num  17 20 15 20 20 12 25 25 12 25 ...
##  $ ELEC.ACCESS   : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS      : num  12 8 8 8 8 14 3 22 22 8 ...
##  $ NFE           : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ASSETS        : num  20 24 22 16 17 12 13 7 11 10 ...
##  $ NADULTM       : num  2 5 1 1 3 4 2 1 1 1 ...
##  $ NADULTF       : num  2 3 2 1 3 3 2 1 1 1 ...
##  $ NCHILDM       : num  2 3 1 1 1 2 0 4 3 2 ...
##  $ NCHILDF       : num  2 3 1 1 1 2 0 1 0 0 ...
##  $ NTEENM        : num  1 1 1 0 2 0 0 0 2 0 ...
##  $ NTEENF        : num  0 1 1 1 0 0 4 1 1 0 ...
##  $ URBAN2011     : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ NFE.TYPES     : Factor w/ 71 levels "Agriculture 0",..: 49 NA 65 NA NA NA 37 NA NA 46 ...
##  $ NADULT        : num  4 8 3 2 6 7 4 2 2 2 ...
##  $ NCHILD        : num  4 6 2 2 2 4 0 5 3 2 ...
##  $ NTEEN         : num  1 2 2 1 2 0 4 1 3 0 ...
##  $ ELEC.HRS.3    : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4    : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
##  $ filter_.      : Factor w/ 2 levels "Not Selected",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS.3.NEW: Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...

head(allvariables.1)

##              STATEID DISTID VNID ROOMS  INCOME       EDU.HH WATER RICE.P
## 1 Jammu & Kashmir 01      2    1    12  176100       none 0 Yes 1     17
## 2 Jammu & Kashmir 01      2    1    10 1039150       none 0 Yes 1     20
## 3 Jammu & Kashmir 01      2    1     3  182340 Secondary 10 Yes 1     15
## 4 Jammu & Kashmir 01      2    1     4   90760       none 0 Yes 1     20
## 5 Jammu & Kashmir 01      2    1    10  212600       none 0 Yes 1     20
## 6 Jammu & Kashmir 01      2    1     5  152100       none 0 Yes 1     12
##   ELEC.ACCESS ELEC.HRS  NFE ASSETS NADULTM NADULTF NCHILDM NCHILDF NTEENM
## 1       Yes 1       12 No 0     20       2       2       2       2      1
## 2       Yes 1        8 No 0     24       5       3       3       3      1
## 3       Yes 1        8 No 0     22       1       2       1       1      1
## 4       Yes 1        8 No 0     16       1       1       1       1      0
## 5       Yes 1        8 No 0     17       3       3       1       1      2
## 6       Yes 1       14 No 0     12       4       3       2       2      0
##   NTEENF URBAN2011         NFE.TYPES NADULT NCHILD NTEEN ELEC.HRS.3
## 1      0   rural 0 Land transport 70      4      4     1       1-16
## 2      1   rural 0              <NA>      8      6     2       1-16
## 3      1   rural 0        Medical 93      3      2     2       1-16
## 4      1   rural 0              <NA>      2      2     1       1-16
## 5      0   rural 0              <NA>      6      2     2       1-16
## 6      0   rural 0              <NA>      7      4     0       1-16
##   ELEC.HRS.4 filter_. ELEC.HRS.3.NEW
## 1      12-18 Selected           1-16
## 2       6-12 Selected           1-16
## 3       6-12 Selected           1-16
## 4       6-12 Selected           1-16
## 5       6-12 Selected           1-16
## 6      12-18 Selected           1-16

tail(allvariables.1)

##             STATEID DISTID VNID ROOMS INCOME       EDU.HH WATER RICE.P
## 39949 Tamil Nadu 33     30   11     5  27000  7th class 7  No 0     26
## 39950 Tamil Nadu 33     30   11     2   9000       none 0 Yes 1     26
## 39951 Tamil Nadu 33     30   11     3 110900 Secondary 10  No 0     18
## 39952 Tamil Nadu 33     30   11     2  15000  5th class 5  No 0     26
## 39953 Tamil Nadu 33     30   11     1  69040  7th class 7  No 0     23
## 39954 Tamil Nadu 33     30   11     2  37000  6th class 6  No 0     23
##       ELEC.ACCESS ELEC.HRS   NFE ASSETS NADULTM NADULTF NCHILDM NCHILDF
## 39949       Yes 1        6 Yes 1     26       1       1       0       0
## 39950        No 0       NA  No 0      8       0       1       0       0
## 39951       Yes 1        6  No 0     17       1       2       1       0
## 39952       Yes 1        6 Yes 1     19       2       1       1       1
## 39953       Yes 1        8  No 0     15       1       3       0       1
## 39954       Yes 1        6 Yes 1     18       1       1       0       1
##       NTEENM NTEENF URBAN2011            NFE.TYPES NADULT NCHILD NTEEN
## 39949      0      0   urban 1  Retail household 67      2      0     0
## 39950      0      0   urban 1     Air transport 72      1      0     0
## 39951      0      2   urban 1   Retail textiles 66      3      1     2
## 39952      0      0   urban 1        Retail nec 68      3      2     0
## 39953      0      0   urban 1 Personal services 96      4      1     0
## 39954      1      1   urban 1       Retail food 65      2      1     2
##       ELEC.HRS.3 ELEC.HRS.4     filter_. ELEC.HRS.3.NEW
## 39949       1-16       6-12 Not Selected           1-16
## 39950       <NA>       <NA> Not Selected           <NA>
## 39951       1-16       6-12 Not Selected           1-16
## 39952       1-16       6-12 Not Selected           1-16
## 39953       1-16       6-12 Not Selected           1-16
## 39954       1-16       6-12 Not Selected           1-16

dim(allvariables.1)

## [1] 39954    27

summary(allvariables.1)  # gives the info about NAs

##               STATEID          DISTID           VNID       
##  Karnataka 29     : 3865   Min.   : 1.00   Min.   : 1.000  
##  Uttar Pradesh 09 : 3824   1st Qu.: 7.00   1st Qu.: 2.000  
##  Maharashtra 27   : 3309   Median :12.00   Median : 4.000  
##  Madhya Pradesh 23: 3123   Mean   :15.43   Mean   : 5.608  
##  Rajasthan 08     : 2707   3rd Qu.:21.00   3rd Qu.: 7.000  
##  West Bengal 19   : 2435   Max.   :68.00   Max.   :39.000  
##  (Other)          :20691                                   
##      ROOMS            INCOME                  EDU.HH        WATER      
##  Min.   : 0.000   Min.   :-1037040   none 0      :24520   No 0 :27201  
##  1st Qu.: 2.000   1st Qu.:   38220   5th class 5 : 3514   Yes 1:12613  
##  Median : 2.000   Median :   72470   Secondary 10: 2057   NA's :  140  
##  Mean   : 2.684   Mean   :  125336   4th class 4 : 1673                
##  3rd Qu.: 3.000   3rd Qu.:  141500   8th class 8 : 1524                
##  Max.   :50.000   Max.   :11360000   (Other)     : 6390                
##  NA's   :172                         NA's        :  276                
##      RICE.P       ELEC.ACCESS      ELEC.HRS        NFE       
##  Min.   :  0.00   No 0 : 4970   Min.   : 0.00   No 0 :31564  
##  1st Qu.: 18.00   Yes 1:34840   1st Qu.: 9.00   Yes 1: 8390  
##  Median : 20.00   NA's :  144   Median :16.00                
##  Mean   : 21.92                 Mean   :15.32                
##  3rd Qu.: 26.00                 3rd Qu.:22.00                
##  Max.   :120.00                 Max.   :24.00                
##                                 NA's   :5197                 
##      ASSETS         NADULTM         NADULTF         NCHILDM       
##  Min.   : 0.00   Min.   :0.000   Min.   :0.000   Min.   : 0.0000  
##  1st Qu.:10.00   1st Qu.:1.000   1st Qu.:1.000   1st Qu.: 0.0000  
##  Median :16.00   Median :1.000   Median :1.000   Median : 0.0000  
##  Mean   :15.39   Mean   :1.424   Mean   :1.492   Mean   : 0.7157  
##  3rd Qu.:21.00   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.: 1.0000  
##  Max.   :33.00   Max.   :9.000   Max.   :9.000   Max.   :10.0000  
##  NA's   :18                                                       
##     NCHILDF            NTEENM           NTEENF         URBAN2011    
##  Min.   : 0.0000   Min.   :0.0000   Min.   :0.0000   rural 0:26134  
##  1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:0.0000   urban 1:13820  
##  Median : 0.0000   Median :0.0000   Median :0.0000                  
##  Mean   : 0.6566   Mean   :0.2852   Mean   :0.2926                  
##  3rd Qu.: 1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000                  
##  Max.   :10.0000   Max.   :5.0000   Max.   :5.0000                  
##                                                                     
##              NFE.TYPES         NADULT           NCHILD      
##  Agriculture 0    : 7570   Min.   : 0.000   Min.   : 0.000  
##  Construction 50  : 3190   1st Qu.: 2.000   1st Qu.: 0.000  
##  Public admin 90  : 1565   Median : 2.000   Median : 1.000  
##  Retail food 65   : 1230   Mean   : 2.917   Mean   : 1.372  
##  Land transport 70:  954   3rd Qu.: 4.000   3rd Qu.: 2.000  
##  (Other)          : 9042   Max.   :18.000   Max.   :18.000  
##  NA's             :16403                                    
##      NTEEN            ELEC.HRS.3    ELEC.HRS.4            filter_.    
##  Min.   :0.0000   No Access:   22   0-6  : 2975   Not Selected:36056  
##  1st Qu.:0.0000   1-16     :16091   6-12 : 8550   Selected    : 3898  
##  Median :0.0000   17-24    :18644   12-18: 6252                       
##  Mean   :0.5778   NA's     : 5197   18-24:16980                       
##  3rd Qu.:1.0000                     NA's : 5197                       
##  Max.   :7.0000                                                       
##                                                                       
##    ELEC.HRS.3.NEW 
##  No Access:   92  
##  1-16     :17334  
##  16-24    :17331  
##  NA's     : 5197  
##                   
##                   
##

#utils::View(allvariables.1) Shows the entire data set

# allvariables.1$STATEID<-as.factor(as.character(allvariables.1$STATEID))
# allvariables.1$EDU.HH<-as.factor(as.character(allvariables.1$EDU.HH))
# allvariables.1$WATER<-as.factor(as.character(allvariables.1$WATER))
# allvariables.1$ ELEC.ACCESS<-as.factor(as.character(allvariables.1$ ELEC.ACCESS))
# allvariables.1$NFE<-as.factor(as.character(allvariables.1$NFE))
# allvariables.1$URBAN2011<-as.factor(as.character(allvariables.1$URBAN2011))
# allvariables.1$NFE.TYPES<-as.factor(as.character(allvariables.1$NFE.TYPES))
# allvariables.1$ELEC.HRS.3<-as.factor(as.character(allvariables.1$ELEC.HRS.3))
# allvariables.1$ELEC.HRS.4<-as.factor(as.character(allvariables.1$ELEC.HRS.4))
# allvariables.1$filter_.<-as.factor(as.character(allvariables.1$filter_.))

# sapply(allvariables.1,class)
# str(allvariables.1)
# summary(allvariables.1)


# Indices of the rows that contain at least one NA
rownos <- which(rowSums(is.na(allvariables.1)) > 0)
length(rownos)  # 19278 rows have at least one NA

## [1] 19278

# Same count obtained directly: rows that are not complete cases
rowsums <- sum(!complete.cases(allvariables.1))
rowsums

## [1] 19278

# Number of NAs in each column, shown as a one-column data frame
na_count <- sapply(allvariables.1, function(column) sum(is.na(column)))
na_count <- data.frame(na_count)
na_count

##                na_count
## STATEID               0
## DISTID                0
## VNID                  0
## ROOMS               172
## INCOME                0
## EDU.HH              276
## WATER               140
## RICE.P                0
## ELEC.ACCESS         144
## ELEC.HRS           5197
## NFE                   0
## ASSETS               18
## NADULTM               0
## NADULTF               0
## NCHILDM               0
## NCHILDF               0
## NTEENM                0
## NTEENF                0
## URBAN2011             0
## NFE.TYPES         16403
## NADULT                0
## NCHILD                0
## NTEEN                 0
## ELEC.HRS.3         5197
## ELEC.HRS.4         5197
## filter_.              0
## ELEC.HRS.3.NEW     5197

# or, in one vectorised call: column sums of the logical NA matrix.
# (apply() on a data frame would first coerce the whole data frame to a
# matrix, which is wasteful here and loses the column types.)
colSums(is.na(allvariables.1))

##        STATEID         DISTID           VNID          ROOMS         INCOME 
##              0              0              0            172              0 
##         EDU.HH          WATER         RICE.P    ELEC.ACCESS       ELEC.HRS 
##            276            140              0            144           5197 
##            NFE         ASSETS        NADULTM        NADULTF        NCHILDM 
##              0             18              0              0              0 
##        NCHILDF         NTEENM         NTEENF      URBAN2011      NFE.TYPES 
##              0              0              0              0          16403 
##         NADULT         NCHILD          NTEEN     ELEC.HRS.3     ELEC.HRS.4 
##              0              0              0           5197           5197 
##       filter_. ELEC.HRS.3.NEW 
##              0           5197

# Positions (named by column) of the columns that contain at least one NA
colnos <- which(colSums(is.na(allvariables.1)) > 0)
colnos

##          ROOMS         EDU.HH          WATER    ELEC.ACCESS       ELEC.HRS 
##              4              6              7              9             10 
##         ASSETS      NFE.TYPES     ELEC.HRS.3     ELEC.HRS.4 ELEC.HRS.3.NEW 
##             12             20             24             25             27

# Drop NFE.TYPES: it has by far the most NAs of any column
allvariables.1[["NFE.TYPES"]] <- NULL

# Keep only the rows that have no NA in any of the remaining columns.
# NOTE(review): the summary printed for this data frame shows ELEC.ACCESS
# "No 0" with 0 rows, so this step appears to remove every household without
# electricity access (presumably because ELEC.HRS is NA for them) - confirm
# that this is intended.
allvariables.1nonas <- na.omit(allvariables.1)
str(allvariables.1nonas)

## 'data.frame':    34442 obs. of  26 variables:
##  $ STATEID       : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ DISTID        : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ VNID          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ROOMS         : num  12 10 3 4 10 5 5 2 7 2 ...
##  $ INCOME        : num  176100 1039150 182340 90760 212600 ...
##  $ EDU.HH        : Factor w/ 18 levels "none 0","1st class 1",..: 1 1 11 1 1 1 1 1 5 1 ...
##  $ WATER         : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 1 2 2 ...
##  $ RICE.P        : num  17 20 15 20 20 12 25 25 12 25 ...
##  $ ELEC.ACCESS   : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS      : num  12 8 8 8 8 14 3 22 22 8 ...
##  $ NFE           : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ASSETS        : num  20 24 22 16 17 12 13 7 11 10 ...
##  $ NADULTM       : num  2 5 1 1 3 4 2 1 1 1 ...
##  $ NADULTF       : num  2 3 2 1 3 3 2 1 1 1 ...
##  $ NCHILDM       : num  2 3 1 1 1 2 0 4 3 2 ...
##  $ NCHILDF       : num  2 3 1 1 1 2 0 1 0 0 ...
##  $ NTEENM        : num  1 1 1 0 2 0 0 0 2 0 ...
##  $ NTEENF        : num  0 1 1 1 0 0 4 1 1 0 ...
##  $ URBAN2011     : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ NADULT        : num  4 8 3 2 6 7 4 2 2 2 ...
##  $ NCHILD        : num  4 6 2 2 2 4 0 5 3 2 ...
##  $ NTEEN         : num  1 2 2 1 2 0 4 1 3 0 ...
##  $ ELEC.HRS.3    : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4    : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
##  $ filter_.      : Factor w/ 2 levels "Not Selected",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS.3.NEW: Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:5512] 42 148 189 198 221 234 270 272 274 278 ...
##   .. ..- attr(*, "names")= chr [1:5512] "42" "148" "189" "198" ...

summary(allvariables.1nonas)

##               STATEID          DISTID          VNID       
##  Karnataka 29     : 3546   Min.   : 1.0   Min.   : 1.000  
##  Maharashtra 27   : 3095   1st Qu.: 6.0   1st Qu.: 2.000  
##  Madhya Pradesh 23: 2532   Median :12.0   Median : 4.000  
##  Uttar Pradesh 09 : 2326   Mean   :14.6   Mean   : 5.788  
##  Rajasthan 08     : 2296   3rd Qu.:20.0   3rd Qu.: 7.000  
##  Andhra Pradesh 28: 2118   Max.   :68.0   Max.   :39.000  
##  (Other)          :18529                                  
##      ROOMS            INCOME                  EDU.HH        WATER      
##  Min.   : 1.000   Min.   :-1037040   none 0      :20413   No 0 :22154  
##  1st Qu.: 2.000   1st Qu.:   43500   5th class 5 : 3201   Yes 1:12288  
##  Median : 2.000   Median :   80500   Secondary 10: 1960                
##  Mean   : 2.791   Mean   :  136009   4th class 4 : 1551                
##  3rd Qu.: 4.000   3rd Qu.:  154000   8th class 8 : 1417                
##  Max.   :50.000   Max.   :11360000   2nd class 2 : 1246                
##                                      (Other)     : 4654                
##      RICE.P       ELEC.ACCESS      ELEC.HRS        NFE       
##  Min.   :  0.00   No 0 :    0   Min.   : 0.00   No 0 :26760  
##  1st Qu.: 18.00   Yes 1:34442   1st Qu.: 9.00   Yes 1: 7682  
##  Median : 22.00                 Median :16.00                
##  Mean   : 22.59                 Mean   :15.32                
##  3rd Qu.: 27.00                 3rd Qu.:22.00                
##  Max.   :120.00                 Max.   :24.00                
##                                                              
##      ASSETS         NADULTM         NADULTF         NCHILDM      
##  Min.   : 1.00   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:12.00   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :17.00   Median :1.000   Median :1.000   Median :0.0000  
##  Mean   :16.68   Mean   :1.463   Mean   :1.528   Mean   :0.6926  
##  3rd Qu.:21.00   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :33.00   Max.   :9.000   Max.   :9.000   Max.   :8.0000  
##                                                                  
##     NCHILDF            NTEENM           NTEENF         URBAN2011    
##  Min.   : 0.0000   Min.   :0.0000   Min.   :0.0000   rural 0:21240  
##  1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:0.0000   urban 1:13202  
##  Median : 0.0000   Median :0.0000   Median :0.0000                  
##  Mean   : 0.6318   Mean   :0.2868   Mean   :0.2926                  
##  3rd Qu.: 1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000                  
##  Max.   :10.0000   Max.   :5.0000   Max.   :5.0000                  
##                                                                     
##      NADULT           NCHILD           NTEEN            ELEC.HRS.3   
##  Min.   : 0.000   Min.   : 0.000   Min.   :0.0000   No Access:   22  
##  1st Qu.: 2.000   1st Qu.: 0.000   1st Qu.:0.0000   1-16     :15932  
##  Median : 3.000   Median : 1.000   Median :0.0000   17-24    :18488  
##  Mean   : 2.991   Mean   : 1.324   Mean   :0.5795                    
##  3rd Qu.: 4.000   3rd Qu.: 2.000   3rd Qu.:1.0000                    
##  Max.   :18.000   Max.   :18.000   Max.   :7.0000                    
##                                                                      
##  ELEC.HRS.4            filter_.       ELEC.HRS.3.NEW 
##  0-6  : 2930   Not Selected:30607   No Access:   91  
##  6-12 : 8491   Selected    : 3835   1-16     :17168  
##  12-18: 6189                        16-24    :17183  
##  18-24:16832                                         
##                                                      
##                                                      
##

dim(allvariables.1nonas)

## [1] 34442    26

head(allvariables.1nonas)

##              STATEID DISTID VNID ROOMS  INCOME       EDU.HH WATER RICE.P
## 1 Jammu & Kashmir 01      2    1    12  176100       none 0 Yes 1     17
## 2 Jammu & Kashmir 01      2    1    10 1039150       none 0 Yes 1     20
## 3 Jammu & Kashmir 01      2    1     3  182340 Secondary 10 Yes 1     15
## 4 Jammu & Kashmir 01      2    1     4   90760       none 0 Yes 1     20
## 5 Jammu & Kashmir 01      2    1    10  212600       none 0 Yes 1     20
## 6 Jammu & Kashmir 01      2    1     5  152100       none 0 Yes 1     12
##   ELEC.ACCESS ELEC.HRS  NFE ASSETS NADULTM NADULTF NCHILDM NCHILDF NTEENM
## 1       Yes 1       12 No 0     20       2       2       2       2      1
## 2       Yes 1        8 No 0     24       5       3       3       3      1
## 3       Yes 1        8 No 0     22       1       2       1       1      1
## 4       Yes 1        8 No 0     16       1       1       1       1      0
## 5       Yes 1        8 No 0     17       3       3       1       1      2
## 6       Yes 1       14 No 0     12       4       3       2       2      0
##   NTEENF URBAN2011 NADULT NCHILD NTEEN ELEC.HRS.3 ELEC.HRS.4 filter_.
## 1      0   rural 0      4      4     1       1-16      12-18 Selected
## 2      1   rural 0      8      6     2       1-16       6-12 Selected
## 3      1   rural 0      3      2     2       1-16       6-12 Selected
## 4      1   rural 0      2      2     1       1-16       6-12 Selected
## 5      0   rural 0      6      2     2       1-16       6-12 Selected
## 6      0   rural 0      7      4     0       1-16      12-18 Selected
##   ELEC.HRS.3.NEW
## 1           1-16
## 2           1-16
## 3           1-16
## 4           1-16
## 5           1-16
## 6           1-16

tail(allvariables.1nonas)

##             STATEID DISTID VNID ROOMS INCOME       EDU.HH WATER RICE.P
## 39948 Tamil Nadu 33     30   11     2  97400  6th class 6  No 0     26
## 39949 Tamil Nadu 33     30   11     5  27000  7th class 7  No 0     26
## 39951 Tamil Nadu 33     30   11     3 110900 Secondary 10  No 0     18
## 39952 Tamil Nadu 33     30   11     2  15000  5th class 5  No 0     26
## 39953 Tamil Nadu 33     30   11     1  69040  7th class 7  No 0     23
## 39954 Tamil Nadu 33     30   11     2  37000  6th class 6  No 0     23
##       ELEC.ACCESS ELEC.HRS   NFE ASSETS NADULTM NADULTF NCHILDM NCHILDF
## 39948       Yes 1        6  No 0     18       3       2       1       0
## 39949       Yes 1        6 Yes 1     26       1       1       0       0
## 39951       Yes 1        6  No 0     17       1       2       1       0
## 39952       Yes 1        6 Yes 1     19       2       1       1       1
## 39953       Yes 1        8  No 0     15       1       3       0       1
## 39954       Yes 1        6 Yes 1     18       1       1       0       1
##       NTEENM NTEENF URBAN2011 NADULT NCHILD NTEEN ELEC.HRS.3 ELEC.HRS.4
## 39948      1      0   urban 1      5      1     1       1-16       6-12
## 39949      0      0   urban 1      2      0     0       1-16       6-12
## 39951      0      2   urban 1      3      1     2       1-16       6-12
## 39952      0      0   urban 1      3      2     0       1-16       6-12
## 39953      0      0   urban 1      4      1     0       1-16       6-12
## 39954      1      1   urban 1      2      1     2       1-16       6-12
##           filter_. ELEC.HRS.3.NEW
## 39948 Not Selected           1-16
## 39949 Not Selected           1-16
## 39951 Not Selected           1-16
## 39952 Not Selected           1-16
## 39953 Not Selected           1-16
## 39954 Not Selected           1-16

# Bar plot of STATEID faceted by ELEC.HRS.3 (counts tabulated first)
table(allvariables.1nonas$STATEID,allvariables.1nonas$ELEC.HRS.3)

##                        
##                         No Access 1-16 17-24
##   Jammu & Kashmir 01            0  529   154
##   Himachal Pradesh 02           0   63  1405
##   Punjab 03                     0  161  1523
##   Chandigarh 04                 0    0     0
##   Uttarakhand 05                0  234   209
##   Haryana 06                    0 1546   159
##   Delhi 07                      0   88   799
##   Rajasthan 08                  0 1253  1043
##   Uttar Pradesh 09              1 2032   293
##   Bihar 10                      0  855   101
##   Sikkim 11                     0    0     0
##   Arunachal Pradesh 12          0    0     0
##   Nagaland 13                   0    0     0
##   Manipur 14                    0    0     0
##   Mizoram 15                    0    0     0
##   Tripura 16                    0    0     0
##   Meghalaya 17                  0    0     0
##   Assam 18                      0    0     0
##   West Bengal 19                0  375  1564
##   Jharkhand 20                  0  321   408
##   Orissa 21                     0  430  1068
##   Chhattisgarh 22               0  186  1001
##   Madhya Pradesh 23            20 1942   570
##   Gujarat 24                    0   43  1734
##   Daman & Diu 25                0    0     0
##   Dadra+Nagar Haveli 26         0    0     0
##   Maharashtra 27                0 1334  1761
##   Andhra Pradesh 28             0 1125   993
##   Karnataka 29                  0 1847  1699
##   Goa 30                        0   95    91
##   Lakshadweep 31                0    0     0
##   Kerala 32                     0   50  1468
##   Tamil Nadu 33                 1 1423   445
##   Pondicherry 34                0    0     0
##   Anadman/Nicobar 35            0    0     0

library(ggplot2)
theme_set(theme_bw())

# Number of households per state, one panel per ELEC.HRS.3 category with the
# panels arranged in a single column.
ggplot(allvariables.1nonas, aes(STATEID)) +
  geom_bar(aes(fill = ELEC.HRS.3)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # scales = "free_y" lets each facet have its own y-axis range
  facet_wrap(~ELEC.HRS.3, ncol = 1, scales = "free_y") +
  scale_y_continuous(breaks = seq(0, 2200, by = 200)) +
  # The fill repeats the facet variable, so the legend is hidden.
  # "none" replaces the deprecated guides(fill = FALSE).
  guides(fill = "none")

# The scales = "free_y" argument gives each facet its own y-axis ticks

theme_set(theme_bw())

# Distribution of ELEC.HRS.3 within each state: one panel per STATEID, each
# with its own y-axis range.
ggplot(allvariables.1nonas, aes(ELEC.HRS.3)) +
  geom_bar(aes(fill = ELEC.HRS.3)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  facet_wrap(~STATEID, scales = "free_y") +
  # The fill repeats the x variable, so the legend is hidden.
  # "none" replaces the deprecated guides(fill = F), which also relied on the
  # reassignable shorthand F.
  guides(fill = "none")

theme_set(theme_bw())

# Bar plot of EDU.HH faceted by ELEC.HRS.4, panels in a single column and each
# with its own y-axis range.
ggplot(allvariables.1nonas, aes(EDU.HH)) +
  geom_bar(aes(fill = ELEC.HRS.4)) +
  facet_wrap(~ELEC.HRS.4, ncol = 1, scales = "free_y") +
  theme(axis.text.x = element_text(angle = 90, face = "bold", size = 8)) +
  # The fill repeats the facet variable, so the legend is hidden.
  # "none" replaces the deprecated guides(fill = FALSE).
  guides(fill = "none")

# Bar plot of EDU.HH faceted by URBAN2011 and ELEC.HRS.4
theme_set(theme_bw())
ggplot(allvariables.1nonas, aes(EDU.HH)) +
  geom_bar(aes(fill = ELEC.HRS.4)) +
  # One panel per URBAN2011 x ELEC.HRS.4 combination, four panels per row
  facet_wrap(URBAN2011 ~ ELEC.HRS.4, ncol = 4, scales = "free_y") +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  # The fill repeats a facet variable, so the legend is hidden.
  # "none" replaces the deprecated guides(fill = FALSE).
  guides(fill = "none")

#Bar chart with standard errors

library(tidyr)
library(dplyr)
library(ggplot2)
library(knitr)
library(scales)
library(RColorBrewer)


# Custom standard-error function: sd(x) / sqrt(n).
#
# Arguments:
#   x      numeric vector.
#   na.rm  if TRUE, missing values are removed first and n counts only the
#          remaining values. The default FALSE keeps the original behaviour,
#          where any NA in x makes the result NA.
# Returns:
#   A single numeric value.
std_err <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  sd(x) / sqrt(length(x))
}

library(dplyr)

# Per-state income summary. Income_med is the median income; Income_se comes
# from std_err(), i.e. it is the standard error of the mean, not of the median.
df_ind_sum <- allvariables.1 %>%
  group_by(STATEID) %>%
  summarise(
    Income_med = median(INCOME),
    Income_se = std_err(INCOME)
  )

kable(df_ind_sum)

STATEID	Income_med	Income_se
Jammu & Kashmir 01	149475	9080.923
Himachal Pradesh 02	104430	7858.078
Punjab 03	116000	7131.363
Uttarakhand 05	81970	6768.412
Haryana 06	98475	10288.974
Delhi 07	170000	10714.613
Rajasthan 08	76200	3296.097
Uttar Pradesh 09	55220	2120.620
Bihar 10	49990	2721.459
West Bengal 19	63000	5453.198
Jharkhand 20	55500	4307.792
Orissa 21	47625	2610.151
Chhattisgarh 22	40386	3682.346
Madhya Pradesh 23	47000	2955.330
Gujarat 24	76800	5924.410
Maharashtra 27	81800	2888.217
Andhra Pradesh 28	62000	2271.968
Karnataka 29	73075	3688.832
Goa 30	103500	8841.920
Kerala 32	134680	4764.551
Tamil Nadu 33	89600	3338.673

table(allvariables.1$STATEID)

## 
##    Jammu & Kashmir 01   Himachal Pradesh 02             Punjab 03 
##                   720                  1476                  1702 
##         Chandigarh 04        Uttarakhand 05            Haryana 06 
##                     0                   468                  1806 
##              Delhi 07          Rajasthan 08      Uttar Pradesh 09 
##                   899                  2707                  3824 
##              Bihar 10             Sikkim 11  Arunachal Pradesh 12 
##                  1547                     0                     0 
##           Nagaland 13            Manipur 14            Mizoram 15 
##                     0                     0                     0 
##            Tripura 16          Meghalaya 17              Assam 18 
##                     0                     0                     0 
##        West Bengal 19          Jharkhand 20             Orissa 21 
##                  2435                   853                  2058 
##       Chhattisgarh 22     Madhya Pradesh 23            Gujarat 24 
##                  1324                  3123                  1895 
##        Daman & Diu 25 Dadra+Nagar Haveli 26        Maharashtra 27 
##                     0                     0                  3309 
##     Andhra Pradesh 28          Karnataka 29                Goa 30 
##                  2203                  3865                   188 
##        Lakshadweep 31             Kerala 32         Tamil Nadu 33 
##                     0                  1570                  1982 
##        Pondicherry 34    Anadman/Nicobar 35 
##                     0                     0

# Median income per state with Income_med +/- Income_se error bars
ggplot(df_ind_sum, aes(x = STATEID, y = Income_med)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Median Income Statewise",
    x = "States",
    y = "Median Income"
  ) +
  theme(axis.text = element_text(size = 8, angle = 90)) +
  geom_errorbar(
    aes(ymin = Income_med - Income_se, ymax = Income_med + Income_se),
    width = .2
  )

# v.2: same chart with explicit y-axis ticks

ggplot(df_ind_sum, aes(x = STATEID, y = Income_med)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Median Income Statewise",
    x = "States",
    y = "Median Income"
  ) +
  theme(axis.text = element_text(size = 8, angle = 90)) +
  # changing the y ticks scale: one tick every 25000 up to 200000
  scale_y_continuous(breaks = seq(0, 200000, by = 25000)) +
  geom_errorbar(
    aes(ymin = Income_med - Income_se, ymax = Income_med + Income_se),
    width = .2
  )

#v 3

#
# ggplot_theme(): reusable chart theme. Takes no arguments and returns
# theme_bw(base_size = 9) with the customisations below; edit the colours and
# sizes here to restyle every chart that adds it.
#
ggplot_theme <- function() {
  # Shades are taken from the 9-step "Greys" palette of RColorBrewer
  greys <- brewer.pal(n = 9, name = "Greys")
  bg_col <- "white"  # greys[2] was the earlier choice
  grid_col <- greys[3]
  axis_text_col <- greys[6]
  axis_title_col <- greys[7]
  title_col <- greys[9]

  # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` in
  # element_line(); `size` is kept so older installations still work.
  theme_bw(base_size = 9) +
    theme(
      # Entire chart region (panel, plot area, border) in the background colour
      panel.background = element_rect(fill = bg_col, color = bg_col),
      plot.background = element_rect(fill = bg_col, color = bg_col),
      panel.border = element_rect(color = bg_col),
      # Grid: thin major lines only, no minor lines, no axis ticks
      panel.grid.major = element_line(color = grid_col, size = .25),
      panel.grid.minor = element_blank(),
      axis.ticks = element_blank(),
      # Legend is formatted but hidden by default
      legend.position = "none",
      legend.background = element_rect(fill = bg_col),
      legend.text = element_text(size = 7, color = axis_title_col),
      # Title, axis titles and tick labels
      plot.title = element_text(color = title_col, size = 12, vjust = 1.25),
      axis.text.x = element_text(size = 7, color = axis_text_col),
      axis.text.y = element_text(size = 7, color = axis_text_col),
      axis.title.x = element_text(size = 10, color = axis_title_col, vjust = 0),
      axis.title.y = element_text(
        size = 10, color = axis_title_col, vjust = 1.25
      ),
      # Plot margins
      plot.margin = unit(c(0.35, 0.2, 0.3, 0.35), "cm")
    )
}

# v 3: horizontal bars in a custom colour, styled with ggplot_theme()
ggplot(df_ind_sum, aes(x = STATEID, y = Income_med)) +
  geom_bar(stat = "identity", fill = "#c0392b", alpha = 0.75) +
  labs(
    title = "Median Income Statewise",
    x = "States",
    y = "Median Income"
  ) +
  theme(axis.text = element_text(size = 8, angle = 90)) +
  geom_errorbar(
    aes(ymin = Income_med - Income_se, ymax = Income_med + Income_se),
    width = .2
  ) +
  coord_flip() +
  ggplot_theme()

# Same chart with the order of the state axis reversed.
# The order is reversed through `limits`, which moves every bar together with
# its label. Passing reversed `labels` would only rename the tick marks and
# leave the bars in place, attaching state names to the wrong bars.
ggplot(df_ind_sum, aes(x = STATEID, y = Income_med)) +
  geom_bar(stat = "identity", fill = "#c0392b", alpha = 0.75) +
  labs(
    title = "Median Income Statewise",
    x = "States",
    y = "Median Income"
  ) +
  theme(axis.text = element_text(size = 8, angle = 90)) +
  scale_x_discrete(
    limits = rev(unique(as.character(df_ind_sum$STATEID)))
  ) +
  geom_errorbar(
    aes(ymin = Income_med - Income_se, ymax = Income_med + Income_se),
    width = .2
  ) +
  coord_flip() +
  ggplot_theme()

###################

# Polar-coordinate version of the median income chart, one colour per state.
ggplot(df_ind_sum, aes(x = STATEID, y = Income_med, fill = STATEID)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Median Income Statewise",
    x = "States",
    y = "Median Income"
  ) +
  theme(axis.text = element_text(size = 8, angle = 90)) +
  geom_errorbar(
    aes(ymin = Income_med - Income_se, ymax = Income_med + Income_se),
    width = .2
  ) +
  coord_polar()

# Line chart using mean of Income

# Mean income and total assets for every STATEID x ELEC.HRS.3 combination.
# NOTE(review): sum(ASSETS) has no na.rm = TRUE, so a single missing ASSETS
# value turns the whole group's TotalAssets into NA -- confirm this is wanted,
# because those groups are dropped further down.
df_ind_mean <- allvariables.1 %>%
  group_by(STATEID, ELEC.HRS.3) %>%
  summarise(
    Income_mean = mean(INCOME),
    TotalAssets = sum(ASSETS)
  )

kable(df_ind_mean)

STATEID	ELEC.HRS.3	Income_mean	TotalAssets
Jammu & Kashmir 01	1-16	213974.15	9986
Jammu & Kashmir 01	17-24	260984.08	3470
Jammu & Kashmir 01	NA	345185.00	31
Himachal Pradesh 02	1-16	129297.44	1061
Himachal Pradesh 02	17-24	177701.44	25650
Himachal Pradesh 02	NA	76081.25	80
Punjab 03	1-16	160272.65	3173
Punjab 03	17-24	202844.17	32231
Punjab 03	NA	70748.76	147
Uttarakhand 05	1-16	112795.15	3830
Uttarakhand 05	17-24	164542.92	3913
Uttarakhand 05	NA	38540.83	176
Haryana 06	1-16	169019.06	NA
Haryana 06	17-24	305111.98	NA
Haryana 06	NA	122365.95	859
Delhi 07	1-16	172350.00	1920
Delhi 07	17-24	265812.55	18113
Delhi 07	NA	230660.00	58
Rajasthan 08	1-16	112094.43	17438
Rajasthan 08	17-24	164266.39	NA
Rajasthan 08	NA	70739.98	2951
Uttar Pradesh 09	No Access	132460.00	12
Uttar Pradesh 09	1-16	114182.21	31337
Uttar Pradesh 09	17-24	147426.82	5537
Uttar Pradesh 09	NA	50347.38	10809
Bihar 10	1-16	97682.27	10970
Bihar 10	17-24	139167.57	1853
Bihar 10	NA	44209.16	NA
West Bengal 19	1-16	130822.35	5341
West Bengal 19	17-24	131854.48	24971
West Bengal 19	NA	54718.13	3343
Jharkhand 20	1-16	91288.91	4605
Jharkhand 20	17-24	122659.47	6487
Jharkhand 20	NA	41349.73	681
Orissa 21	1-16	69659.57	5409
Orissa 21	17-24	110416.24	16198
Orissa 21	NA	42482.44	2840
Chhattisgarh 22	1-16	62157.20	1935
Chhattisgarh 22	17-24	96740.57	14100
Chhattisgarh 22	NA	28314.21	649
Madhya Pradesh 23	No Access	22116.30	104
Madhya Pradesh 23	1-16	80403.43	NA
Madhya Pradesh 23	17-24	132155.99	10287
Madhya Pradesh 23	NA	38475.82	3287
Gujarat 24	1-16	50916.48	509
Gujarat 24	17-24	143961.75	NA
Gujarat 24	NA	67489.78	577
Maharashtra 27	1-16	125510.31	20917
Maharashtra 27	17-24	147213.11	32273
Maharashtra 27	NA	59661.96	1390
Andhra Pradesh 28	1-16	77522.75	NA
Andhra Pradesh 28	17-24	98467.15	NA
Andhra Pradesh 28	NA	41195.14	NA
Karnataka 29	1-16	105682.90	NA
Karnataka 29	17-24	148221.97	NA
Karnataka 29	NA	71562.49	NA
Goa 30	1-16	133228.14	2106
Goa 30	17-24	139525.05	1923
Goa 30	NA	50000.00	23
Kerala 32	1-16	141827.86	984
Kerala 32	17-24	174356.54	NA
Kerala 32	NA	212943.95	607
Tamil Nadu 33	No Access	59500.00	16
Tamil Nadu 33	1-16	124014.88	26321
Tamil Nadu 33	17-24	132274.07	8761
Tamil Nadu 33	NA	71132.70	489

str(df_ind_mean)

## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  66 obs. of  4 variables:
##  $ STATEID    : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 2 2 2 3 3 3 5 ...
##  $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 3 NA 2 3 NA 2 3 NA 2 ...
##  $ Income_mean: num  213974 260984 345185 129297 177701 ...
##  $ TotalAssets: num  9986 3470 31 1061 25650 ...
##  - attr(*, "vars")=List of 1
##   ..$ : symbol STATEID
##  - attr(*, "drop")= logi TRUE

# Number of missing values in each column
colSums(is.na(df_ind_mean))

##     STATEID  ELEC.HRS.3 Income_mean TotalAssets 
##           0          21           0          13

# Locate the rows that contain at least one NA so they can be removed

df <- which(rowSums(is.na(df_ind_mean)) > 0)
df # row numbers that have NAs

##  [1]  3  6  9 12 13 14 15 18 20 21 25 28 31 34 37 40 42 44 46 47 50 51 52
## [24] 53 54 55 56 59 61 62 66

length(df) #no of rows to be deleted

## [1] 31

# Remove the rows listed in `df` by selecting the rows to keep.
# Negative indexing (`x[-df, ]`) returns zero rows when `df` is empty,
# whereas this form then keeps the whole table.
df_ind_mean <- df_ind_mean[setdiff(seq_len(nrow(df_ind_mean)), df), ] # removing NAs
str(df_ind_mean)

## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  35 obs. of  4 variables:
##  $ STATEID    : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 2 2 3 3 5 5 7 7 ...
##  $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 3 2 3 2 3 2 3 2 3 ...
##  $ Income_mean: num  213974 260984 129297 177701 160273 ...
##  $ TotalAssets: num  9986 3470 1061 25650 3173 ...
##  - attr(*, "vars")=List of 1
##   ..$ : symbol STATEID
##  - attr(*, "drop")= logi TRUE
##  - attr(*, "indices")=List of 18
##   ..$ : int  0 1
##   ..$ : int  2 3
##   ..$ : int  4 5
##   ..$ : int  6 7
##   ..$ : int  8 9
##   ..$ : int 10
##   ..$ : int  11 12 13
##   ..$ : int  14 15
##   ..$ : int  16 17
##   ..$ : int  18 19
##   ..$ : int  20 21
##   ..$ : int  22 23
##   ..$ : int  24 25
##   ..$ : int 26
##   ..$ : int  27 28
##   ..$ : int  29 30
##   ..$ : int 31
##   ..$ : int  32 33 34
##  - attr(*, "group_sizes")= int  2 2 2 2 2 1 3 2 2 2 ...
##  - attr(*, "biggest_group_size")= int 3
##  - attr(*, "labels")='data.frame':   18 obs. of  1 variable:
##   ..$ STATEID: Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 2 3 5 7 8 9 10 19 20 ...
##   ..- attr(*, "vars")=List of 1
##   .. ..$ : symbol STATEID
##   ..- attr(*, "drop")= logi TRUE

# Confirm that no missing values are left in any column
# is.na(df_ind_mean)
colSums(is.na(df_ind_mean))

##     STATEID  ELEC.HRS.3 Income_mean TotalAssets 
##           0           0           0           0

# Use the black-and-white theme for the plots that follow.
theme_set(theme_bw())

# Mean income per electricity-hours category, one panel per state.
ggplot(df_ind_mean, aes(ELEC.HRS.3, Income_mean)) +
  geom_bar(stat = "identity") +
  facet_wrap(~STATEID) +
  scale_y_continuous(labels = comma)

#####################

# One line per state: mean income across the electricity-hours categories.
ggplot(
  df_ind_mean,
  aes(ELEC.HRS.3, Income_mean, color = STATEID, group = STATEID)
) +
  geom_line() +
  scale_y_continuous(labels = comma)

# The plot above shows very few states with households in the "No Access"
# category, so re-check the overall number of households per ELEC.HRS.3 level.

table(allvariables.1$ELEC.HRS.3)

## 
## No Access      1-16     17-24 
##        22     16091     18644

# Mean income against total assets, one line per electricity-hours category.
ggplot(df_ind_mean, aes(x = TotalAssets)) +
  geom_line(aes(y = Income_mean, color = ELEC.HRS.3))

# Same relationship as points, one panel per electricity-hours category.
ggplot(df_ind_mean, aes(x = TotalAssets, y = Income_mean)) +
  geom_point(size = 2, color = "steelblue", stroke = 1) +
  facet_wrap(~ ELEC.HRS.3) +
  scale_y_continuous(labels = comma)

############################################
# Line chart using median Income

# Median income and total assets for every STATEID x ELEC.HRS.3 combination.
# NOTE(review): sum(ASSETS) has no na.rm = TRUE, so a single missing ASSETS
# value turns the whole group's TotalAssets into NA -- confirm this is wanted.
df_ind_median <- allvariables.1 %>%
  group_by(STATEID, ELEC.HRS.3) %>%
  summarise(
    Income_median = median(INCOME),
    TotalAssets = sum(ASSETS)
  )

kable(df_ind_median)

STATEID	ELEC.HRS.3	Income_median	TotalAssets
Jammu & Kashmir 01	1-16	141000.0	9986
Jammu & Kashmir 01	17-24	174601.0	3470
Jammu & Kashmir 01	NA	345185.0	31
Himachal Pradesh 02	1-16	86550.0	1061
Himachal Pradesh 02	17-24	105719.0	25650
Himachal Pradesh 02	NA	70820.0	80
Punjab 03	1-16	97160.0	3173
Punjab 03	17-24	120000.0	32231
Punjab 03	NA	64100.0	147
Uttarakhand 05	1-16	72000.0	3830
Uttarakhand 05	17-24	108830.0	3913
Uttarakhand 05	NA	32490.0	176
Haryana 06	1-16	96412.5	NA
Haryana 06	17-24	150400.0	NA
Haryana 06	NA	62800.0	859
Delhi 07	1-16	105000.0	1920
Delhi 07	17-24	180000.0	18113
Delhi 07	NA	84700.0	58
Rajasthan 08	1-16	68982.5	17438
Rajasthan 08	17-24	100000.0	NA
Rajasthan 08	NA	50695.0	2951
Uttar Pradesh 09	No Access	132460.0	12
Uttar Pradesh 09	1-16	70200.0	31337
Uttar Pradesh 09	17-24	87850.0	5537
Uttar Pradesh 09	NA	38250.0	10809
Bihar 10	1-16	61500.0	10970
Bihar 10	17-24	87775.0	1853
Bihar 10	NA	36420.0	NA
West Bengal 19	1-16	58740.0	5341
West Bengal 19	17-24	78925.0	24971
West Bengal 19	NA	39850.0	3343
Jharkhand 20	1-16	51500.0	4605
Jharkhand 20	17-24	71330.0	6487
Jharkhand 20	NA	33892.5	681
Orissa 21	1-16	47600.0	5409
Orissa 21	17-24	63000.0	16198
Orissa 21	NA	32242.5	2840
Chhattisgarh 22	1-16	30002.5	1935
Chhattisgarh 22	17-24	47730.0	14100
Chhattisgarh 22	NA	24250.0	649
Madhya Pradesh 23	No Access	18965.0	104
Madhya Pradesh 23	1-16	45707.5	NA
Madhya Pradesh 23	17-24	88800.0	10287
Madhya Pradesh 23	NA	29395.0	3287
Gujarat 24	1-16	37090.0	509
Gujarat 24	17-24	80500.0	NA
Gujarat 24	NA	46600.0	577
Maharashtra 27	1-16	77007.5	20917
Maharashtra 27	17-24	96000.0	32273
Maharashtra 27	NA	53195.0	1390
Andhra Pradesh 28	1-16	58150.0	NA
Andhra Pradesh 28	17-24	72330.0	NA
Andhra Pradesh 28	NA	34625.0	NA
Karnataka 29	1-16	69520.0	NA
Karnataka 29	17-24	85190.0	NA
Karnataka 29	NA	53400.0	NA
Goa 30	1-16	109500.0	2106
Goa 30	17-24	99000.0	1923
Goa 30	NA	50000.0	23
Kerala 32	1-16	118015.0	984
Kerala 32	17-24	135600.0	NA
Kerala 32	NA	112000.0	607
Tamil Nadu 33	No Access	59500.0	16
Tamil Nadu 33	1-16	90000.0	26321
Tamil Nadu 33	17-24	91000.0	8761
Tamil Nadu 33	NA	52435.0	489

str(df_ind_median)

## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  66 obs. of  4 variables:
##  $ STATEID      : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 2 2 2 3 3 3 5 ...
##  $ ELEC.HRS.3   : Factor w/ 3 levels "No Access","1-16",..: 2 3 NA 2 3 NA 2 3 NA 2 ...
##  $ Income_median: num  141000 174601 345185 86550 105719 ...
##  $ TotalAssets  : num  9986 3470 31 1061 25650 ...
##  - attr(*, "vars")=List of 1
##   ..$ : symbol STATEID
##  - attr(*, "drop")= logi TRUE

# Number of missing values in each column
colSums(is.na(df_ind_median))

##       STATEID    ELEC.HRS.3 Income_median   TotalAssets 
##             0            21             0            13

# Locate the rows that contain at least one NA so they can be removed

df <- which(rowSums(is.na(df_ind_median)) > 0)
df # row numbers that have NAs

##  [1]  3  6  9 12 13 14 15 18 20 21 25 28 31 34 37 40 42 44 46 47 50 51 52
## [24] 53 54 55 56 59 61 62 66

length(df) #no of rows to be deleted

## [1] 31

# Remove the rows listed in `df` by selecting the rows to keep.
# Negative indexing (`x[-df, ]`) returns zero rows when `df` is empty,
# whereas this form then keeps the whole table.
df_ind_median <- df_ind_median[setdiff(seq_len(nrow(df_ind_median)), df), ] # removing NAs
str(df_ind_median)

## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  35 obs. of  4 variables:
##  $ STATEID      : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 2 2 3 3 5 5 7 7 ...
##  $ ELEC.HRS.3   : Factor w/ 3 levels "No Access","1-16",..: 2 3 2 3 2 3 2 3 2 3 ...
##  $ Income_median: num  141000 174601 86550 105719 97160 ...
##  $ TotalAssets  : num  9986 3470 1061 25650 3173 ...
##  - attr(*, "vars")=List of 1
##   ..$ : symbol STATEID
##  - attr(*, "drop")= logi TRUE
##  - attr(*, "indices")=List of 18
##   ..$ : int  0 1
##   ..$ : int  2 3
##   ..$ : int  4 5
##   ..$ : int  6 7
##   ..$ : int  8 9
##   ..$ : int 10
##   ..$ : int  11 12 13
##   ..$ : int  14 15
##   ..$ : int  16 17
##   ..$ : int  18 19
##   ..$ : int  20 21
##   ..$ : int  22 23
##   ..$ : int  24 25
##   ..$ : int 26
##   ..$ : int  27 28
##   ..$ : int  29 30
##   ..$ : int 31
##   ..$ : int  32 33 34
##  - attr(*, "group_sizes")= int  2 2 2 2 2 1 3 2 2 2 ...
##  - attr(*, "biggest_group_size")= int 3
##  - attr(*, "labels")='data.frame':   18 obs. of  1 variable:
##   ..$ STATEID: Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 2 3 5 7 8 9 10 19 20 ...
##   ..- attr(*, "vars")=List of 1
##   .. ..$ : symbol STATEID
##   ..- attr(*, "drop")= logi TRUE

# Confirm that no missing values are left in any column
# is.na(df_ind_median)
colSums(is.na(df_ind_median))

##       STATEID    ELEC.HRS.3 Income_median   TotalAssets 
##             0             0             0             0

# Use the black-and-white theme for the plots that follow.
theme_set(theme_bw())

# Median income per electricity-hours category, one panel per state.
ggplot(df_ind_median, aes(ELEC.HRS.3, Income_median)) +
  geom_bar(stat = "identity") +
  facet_wrap(~STATEID) +
  scale_y_continuous(labels = comma)

# One line per state across the electricity-hours categories.
ggplot(
  df_ind_median,
  aes(ELEC.HRS.3, Income_median, color = STATEID, group = STATEID)
) +
  geom_line() +
  scale_y_continuous(labels = comma)

# Median income against total assets, one line per category.
ggplot(df_ind_median, aes(x = TotalAssets)) +
  geom_line(aes(y = Income_median, color = ELEC.HRS.3))

# Same relationship as points, one panel per category.
ggplot(df_ind_median, aes(x = TotalAssets, y = Income_median)) +
  geom_point(size = 2, color = "steelblue", stroke = 1) +
  facet_wrap(~ ELEC.HRS.3) +
  scale_y_continuous(labels = comma)

# Boxplots

# Keep only the columns used by the box plots below
allvariables.1new <- allvariables.1 %>%
  select(STATEID, INCOME, NFE, ASSETS, ELEC.HRS.3, ELEC.HRS.4)

head(allvariables.1new)

##              STATEID  INCOME  NFE ASSETS ELEC.HRS.3 ELEC.HRS.4
## 1 Jammu & Kashmir 01  176100 No 0     20       1-16      12-18
## 2 Jammu & Kashmir 01 1039150 No 0     24       1-16       6-12
## 3 Jammu & Kashmir 01  182340 No 0     22       1-16       6-12
## 4 Jammu & Kashmir 01   90760 No 0     16       1-16       6-12
## 5 Jammu & Kashmir 01  212600 No 0     17       1-16       6-12
## 6 Jammu & Kashmir 01  152100 No 0     12       1-16      12-18

# Number of missing values in each column
colSums(is.na(allvariables.1new))

##    STATEID     INCOME        NFE     ASSETS ELEC.HRS.3 ELEC.HRS.4 
##          0          0          0         18       5197       5197

# Rows of allvariables.1new that contain at least one NA
df1 <- which(rowSums(is.na(allvariables.1new)) > 0)
# df1 # row numbers that have NAs
length(df1) # number of rows to be deleted

## [1] 5209

# Remove the rows listed in `df1` by selecting the rows to keep.
# Negative indexing (`x[-df1, ]`) returns zero rows when `df1` is empty,
# whereas this form then keeps the whole table.
df_new <- allvariables.1new[setdiff(seq_len(nrow(allvariables.1new)), df1), ] # removing NAs
str(df_new)

## 'data.frame':    34745 obs. of  6 variables:
##  $ STATEID   : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ INCOME    : num  176100 1039150 182340 90760 212600 ...
##  $ NFE       : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ASSETS    : num  20 24 22 16 17 12 13 7 11 10 ...
##  $ ELEC.HRS.3: Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4: Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...

# Number of missing values left in each column
colSums(is.na(df_new))

##    STATEID     INCOME        NFE     ASSETS ELEC.HRS.3 ELEC.HRS.4 
##          0          0          0          0          0          0

# log(INCOME) by NFE, split by the four-level electricity-hours factor
ggplot(df_new) +
  geom_boxplot(aes(x = NFE, y = log(INCOME), fill = ELEC.HRS.4))

# log(INCOME) by NFE, split by the three-level electricity-hours factor
ggplot(df_new) +
  geom_boxplot(aes(x = NFE, y = log(INCOME), fill = ELEC.HRS.3))

# ASSETS by ELEC.HRS.3: notched box plot next to an untrimmed violin plot
p1 <- ggplot(df_new, aes(ELEC.HRS.3, ASSETS)) +
  geom_boxplot(
    aes(fill = ELEC.HRS.3),
    width = 0.5,
    outlier.colour = "dodgerblue",
    outlier.size = 4,
    outlier.shape = 16,
    outlier.stroke = 2,
    notch = TRUE
  ) +
  labs(title = "Box plot")
p2 <- ggplot(df_new, aes(ELEC.HRS.3, ASSETS)) +
  geom_violin(aes(fill = ELEC.HRS.3), width = 0.5, trim = FALSE) +
  labs(title = "Violin plot (untrimmed)")
gridExtra::grid.arrange(p1, p2, ncol = 2)

# ASSETS by NFE: notched box plot next to an untrimmed violin plot
p3 <- ggplot(df_new, aes(NFE, ASSETS)) +
  geom_boxplot(
    aes(fill = NFE),
    width = 0.5,
    outlier.colour = "dodgerblue",
    outlier.size = 4,
    outlier.shape = 16,
    outlier.stroke = 2,
    notch = TRUE
  ) +
  labs(title = "Box plot")
p4 <- ggplot(df_new, aes(NFE, ASSETS)) +
  geom_violin(aes(fill = NFE), width = 0.5, trim = FALSE) +
  labs(title = "Violin plot (untrimmed)")
gridExtra::grid.arrange(p3, p4, ncol = 2)

library(extracat)
# rmb() plot of the three categorical variables
rmb(formula = ~ NFE + ELEC.HRS.3 + ELEC.HRS.4, data = df_new)

# NOTE(review): log(INCOME) is continuous, so this table has one column per
# distinct value -- confirm a bar plot of it is really what is wanted.
barplot(
  table(df_new$NFE, log(df_new$INCOME)),
  legend.text = TRUE,
  main = "Log(Income) by NFE",
  xlab = "Log(Income)"
)

barplot(
  table(df_new$NFE, df_new$ASSETS),
  legend.text = TRUE,
  main = "Assets by NFE",
  xlab = "Assets"
)

# Stacked bar chart of ASSETS counts, filled by NFE.
# The bars are mapped through the `fill` aesthetic, so the manual palette has
# to be set on the fill scale; a colour scale would never be applied here.
ggplot(df_new) +
  geom_bar(aes(x = ASSETS, fill = NFE)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.line = element_line(color = "black")
  ) +
  ggtitle("Assets by NFE") +
  scale_fill_manual(name = "NFE", values = c("#11c2d7", "#9f0303"))

# Counts of ASSETS values stacked by the three-level electricity-hours factor
barplot(
  table(df_new$ELEC.HRS.3, df_new$ASSETS),
  legend.text = TRUE,
  main = "Assets by ELEC.HRS.3",
  xlab = "Assets"
)

# Same, stacked by the four-level electricity-hours factor
barplot(
  table(df_new$ELEC.HRS.4, df_new$ASSETS),
  legend.text = TRUE,
  main = "Assets by ELEC.HRS.4",
  xlab = "Assets"
)

# ggplot2 version of the chart above
ggplot(df_new) +
  geom_bar(aes(x = ASSETS, fill = ELEC.HRS.4)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.line = element_line(color = "black")
  ) +
  ggtitle("Assets by ELEC.HRS.4")

# NFE counts within each ELEC.HRS.4 level
barplot(
  table(df_new$NFE, df_new$ELEC.HRS.4),
  legend.text = TRUE,
  main = "ELEC.HRS.4 by NFE",
  xlab = "ELEC.HRS.4 "
)

# Frequency of each ELEC.HRS.4 level
plot(df_new$ELEC.HRS.4)

# Mosaic plot of NFE against ELEC.HRS.4
mosaicplot(
  table(df_new$NFE, df_new$ELEC.HRS.4),
  xlab = "NFE",
  ylab = "ELEC.HRS.4"
)

# log(INCOME) against ASSETS: one box per ASSETS value, points and a linear
# fit coloured by ELEC.HRS.3.
ggplot(df_new, aes(x = ASSETS, y = log(INCOME))) +
  geom_boxplot(aes(group = ASSETS)) +
  geom_point(aes(color = ELEC.HRS.3)) +
  geom_smooth(method = "lm", aes(color = ELEC.HRS.3)) +
  labs(title = "Assets vs. log(INCOME) grouped by ELEC.HRS.3 ") +
  theme(plot.title = element_text(size = 20))

# ROOMS against ASSETS, one line per ELEC.HRS.3 level
ggplot(allvariables.1nonas, aes(x = ASSETS, y = ROOMS, color = ELEC.HRS.3)) +
  geom_line() +
  ggtitle("Assets vs. Rooms grouped by ELEC.HRS.3")

# ROOMS against RICE.P, one line per ELEC.HRS.3 level
ggplot(allvariables.1nonas, aes(x = RICE.P, y = ROOMS, color = ELEC.HRS.3)) +
  geom_line() +
  ggtitle("RICE.P vs. ROOMS grouped by ELEC.HRS.3")

# ROOMS against RICE.P, one line per NFE level
ggplot(allvariables.1nonas, aes(x = RICE.P, y = ROOMS, color = NFE)) +
  geom_line() +
  ggtitle("RICE.P vs. ROOMS grouped by NFE")

#############################

library(beanplot)
library(RColorBrewer)

# One colour vector per bean: a Set3 fill colour followed by three blacks.
bean.cols <- lapply(
  brewer.pal(6, "Set3"),
  function(fill_col) c(fill_col, rep("black", 3))
)

# Bean plot of ASSETS for every ELEC.HRS.4 level
beanplot(
  ASSETS ~ ELEC.HRS.4,
  data = df_new,
  main = "Relationship between ELEC.HRS.4 and ASSETS",
  xlab = "ELEC.HRS.4",
  ylab = "ASSETS",
  col = bean.cols,
  lwd = 1,
  what = c(1, 1, 1, 1)
)

# In one panel

# Keep only the columns used by the panel of box plots below
allvariables.1box <- allvariables.1 %>%
  select(STATEID, INCOME, ELEC.ACCESS, NFE, URBAN2011, ELEC.HRS.3, ELEC.HRS.4)

head(allvariables.1box)

##              STATEID  INCOME ELEC.ACCESS  NFE URBAN2011 ELEC.HRS.3
## 1 Jammu & Kashmir 01  176100       Yes 1 No 0   rural 0       1-16
## 2 Jammu & Kashmir 01 1039150       Yes 1 No 0   rural 0       1-16
## 3 Jammu & Kashmir 01  182340       Yes 1 No 0   rural 0       1-16
## 4 Jammu & Kashmir 01   90760       Yes 1 No 0   rural 0       1-16
## 5 Jammu & Kashmir 01  212600       Yes 1 No 0   rural 0       1-16
## 6 Jammu & Kashmir 01  152100       Yes 1 No 0   rural 0       1-16
##   ELEC.HRS.4
## 1      12-18
## 2       6-12
## 3       6-12
## 4       6-12
## 5       6-12
## 6      12-18

# Number of missing values in each column
colSums(is.na(allvariables.1box))

##     STATEID      INCOME ELEC.ACCESS         NFE   URBAN2011  ELEC.HRS.3 
##           0           0         144           0           0        5197 
##  ELEC.HRS.4 
##        5197

# Rows with an NA in any of the first four columns
# (STATEID, INCOME, ELEC.ACCESS, NFE)
df2 <- which(rowSums(is.na(allvariables.1box[, 1:4])) > 0)
# df2 # row numbers that have NAs
length(df2) # number of rows to be deleted

## [1] 144

# Remove the rows listed in `df2` by selecting the rows to keep.
# Negative indexing (`x[-df2, ]`) returns zero rows when `df2` is empty,
# whereas this form then keeps the whole table.
df_g1 <- allvariables.1box[setdiff(seq_len(nrow(allvariables.1box)), df2), ] # removing NAs
str(df_g1)

## 'data.frame':    39810 obs. of  7 variables:
##  $ STATEID    : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ INCOME     : num  176100 1039150 182340 90760 212600 ...
##  $ ELEC.ACCESS: Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ NFE        : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ URBAN2011  : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4 : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...

# Number of missing values left in each column
colSums(is.na(df_g1))

##     STATEID      INCOME ELEC.ACCESS         NFE   URBAN2011  ELEC.HRS.3 
##           0           0           0           0           0        5055 
##  ELEC.HRS.4 
##        5055

# Jittered points with a transparent box plot on top: log(INCOME) by
# ELEC.ACCESS, NFE and URBAN2011. The plots are stored and drawn together
# later with grid.arrange().
g1 <- ggplot(data = df_g1, aes(x = ELEC.ACCESS, y = log(INCOME))) +
  geom_jitter(alpha = 0.3, color = "tomato") +
  geom_boxplot(alpha = 0)

g2 <- ggplot(data = df_g1, aes(x = NFE, y = log(INCOME))) +
  geom_jitter(alpha = 0.3, color = "tomato") +
  geom_boxplot(alpha = 0)

g3 <- ggplot(data = df_g1, aes(x = URBAN2011, y = log(INCOME))) +
  geom_jitter(alpha = 0.3, color = "tomato") +
  geom_boxplot(alpha = 0)

# Keep only the columns used by the box plots below
# NOTE(review): allvariables.1box2 selects the same columns as
# allvariables.1box and is not referenced by the code that follows, which
# keeps using allvariables.1box -- confirm which name was intended.
allvariables.1box2 <- allvariables.1 %>%
  select(STATEID, INCOME, ELEC.ACCESS, NFE, URBAN2011, ELEC.HRS.3, ELEC.HRS.4)

head(allvariables.1box)

##              STATEID  INCOME ELEC.ACCESS  NFE URBAN2011 ELEC.HRS.3
## 1 Jammu & Kashmir 01  176100       Yes 1 No 0   rural 0       1-16
## 2 Jammu & Kashmir 01 1039150       Yes 1 No 0   rural 0       1-16
## 3 Jammu & Kashmir 01  182340       Yes 1 No 0   rural 0       1-16
## 4 Jammu & Kashmir 01   90760       Yes 1 No 0   rural 0       1-16
## 5 Jammu & Kashmir 01  212600       Yes 1 No 0   rural 0       1-16
## 6 Jammu & Kashmir 01  152100       Yes 1 No 0   rural 0       1-16
##   ELEC.HRS.4
## 1      12-18
## 2       6-12
## 3       6-12
## 4       6-12
## 5       6-12
## 6      12-18

# Number of missing values in each column
colSums(is.na(allvariables.1box))

##     STATEID      INCOME ELEC.ACCESS         NFE   URBAN2011  ELEC.HRS.3 
##           0           0         144           0           0        5197 
##  ELEC.HRS.4 
##        5197

# Rows with an NA in any column except the third (ELEC.ACCESS)
df3 <- which(rowSums(is.na(allvariables.1box[, c(1, 2, 4:7)])) > 0)
# df3 # row numbers that have NAs
length(df3) # number of rows to be deleted

## [1] 5197

# Remove the rows listed in `df3` by selecting the rows to keep.
# Negative indexing (`x[-df3, ]`) returns zero rows when `df3` is empty,
# whereas this form then keeps the whole table.
df_g4 <- allvariables.1box[setdiff(seq_len(nrow(allvariables.1box)), df3), ] # removing NAs
str(df_g4)

## 'data.frame':    34757 obs. of  7 variables:
##  $ STATEID    : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ INCOME     : num  176100 1039150 182340 90760 212600 ...
##  $ ELEC.ACCESS: Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ NFE        : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ URBAN2011  : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4 : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...

# Number of missing values left in each column
colSums(is.na(df_g4))

##     STATEID      INCOME ELEC.ACCESS         NFE   URBAN2011  ELEC.HRS.3 
##           0           0           2           0           0           0 
##  ELEC.HRS.4 
##           0

# Jittered points with a transparent box plot on top: log(INCOME) by
# ELEC.HRS.3, ELEC.HRS.4 and STATEID.
g4 <- ggplot(data = df_g4, aes(x = ELEC.HRS.3, y = log(INCOME))) +
  geom_jitter(alpha = 0.3, color = "tomato") +
  geom_boxplot(alpha = 0)

g5 <- ggplot(data = df_g4, aes(x = ELEC.HRS.4, y = log(INCOME))) +
  geom_jitter(alpha = 0.3, color = "tomato") +
  geom_boxplot(alpha = 0)

g6 <- ggplot(data = df_g4, aes(x = STATEID, y = log(INCOME))) +
  geom_jitter(alpha = 0.3, color = "tomato") +
  geom_boxplot(alpha = 0) +
  theme(axis.text = element_text(size = 6, angle = 90))

# Draw the six stored plots in a two-column grid
library(gridExtra)
grid.arrange(g1, g2, g3, g4, g5, g6, ncol = 2)

# Scatterplots

# Keep only the columns used by the scatter plots below
allvariables.1scat <- allvariables.1 %>%
  select(
    STATEID, INCOME, ELEC.ACCESS, ELEC.HRS, NFE, ASSETS,
    URBAN2011, ELEC.HRS.3, ELEC.HRS.4
  )

head(allvariables.1scat)

##              STATEID  INCOME ELEC.ACCESS ELEC.HRS  NFE ASSETS URBAN2011
## 1 Jammu & Kashmir 01  176100       Yes 1       12 No 0     20   rural 0
## 2 Jammu & Kashmir 01 1039150       Yes 1        8 No 0     24   rural 0
## 3 Jammu & Kashmir 01  182340       Yes 1        8 No 0     22   rural 0
## 4 Jammu & Kashmir 01   90760       Yes 1        8 No 0     16   rural 0
## 5 Jammu & Kashmir 01  212600       Yes 1        8 No 0     17   rural 0
## 6 Jammu & Kashmir 01  152100       Yes 1       14 No 0     12   rural 0
##   ELEC.HRS.3 ELEC.HRS.4
## 1       1-16      12-18
## 2       1-16       6-12
## 3       1-16       6-12
## 4       1-16       6-12
## 5       1-16       6-12
## 6       1-16      12-18

# Number of missing values in each column
colSums(is.na(allvariables.1scat))

##     STATEID      INCOME ELEC.ACCESS    ELEC.HRS         NFE      ASSETS 
##           0           0         144        5197           0          18 
##   URBAN2011  ELEC.HRS.3  ELEC.HRS.4 
##           0        5197        5197

# Rows with an NA in any of the first three columns
# (STATEID, INCOME, ELEC.ACCESS)
df4 <- which(rowSums(is.na(allvariables.1scat[, 1:3])) > 0)
# df4 # row numbers that have NAs
length(df4) # number of rows to be deleted

## [1] 144

# Remove the rows listed in `df4` by selecting the rows to keep.
# Negative indexing (`x[-df4, ]`) returns zero rows when `df4` is empty,
# whereas this form then keeps the whole table.
df_g5 <- allvariables.1scat[setdiff(seq_len(nrow(allvariables.1scat)), df4), ] # removing NAs
str(df_g5)

## 'data.frame':    39810 obs. of  9 variables:
##  $ STATEID    : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ INCOME     : num  176100 1039150 182340 90760 212600 ...
##  $ ELEC.ACCESS: Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS   : num  12 8 8 8 8 14 3 22 22 8 ...
##  $ NFE        : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ASSETS     : num  20 24 22 16 17 12 13 7 11 10 ...
##  $ URBAN2011  : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4 : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...

# Number of missing values left in each column
colSums(is.na(df_g5))

##     STATEID      INCOME ELEC.ACCESS    ELEC.HRS         NFE      ASSETS 
##           0           0           0        5055           0          12 
##   URBAN2011  ELEC.HRS.3  ELEC.HRS.4 
##           0        5055        5055

table(df_g5$ELEC.HRS)

## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
##   22   70  264  467  906 1245 2125  783 2653  713 2123  152 2487  246  583 
##   15   16   17   18   19   20   21   22   23   24 
## 1272 1313  351 2535  330 3882  624 3273 2155 4181

table(df_g5$NFE,df_g5$ELEC.ACCESS,df_g5$ELEC.HRS.3)

## , ,  = No Access
## 
##        
##          No 0 Yes 1
##   No 0      0    20
##   Yes 1     0     2
## 
## , ,  = 1-16
## 
##        
##          No 0 Yes 1
##   No 0      0 12735
##   Yes 1     0  3354
## 
## , ,  = 17-24
## 
##        
##          No 0 Yes 1
##   No 0      0 14252
##   Yes 1     0  4392

# INCOME against ELEC.HRS, one panel per ELEC.ACCESS level
ggplot(df_g5, aes(ELEC.HRS, INCOME)) +
  geom_point(color = "aquamarine4") +
  facet_wrap(~ELEC.ACCESS, ncol = 2) +
  ggtitle("Elect Hours and Income grouped by ELEC.ACCESS") +
  scale_y_continuous(labels = comma)

# INCOME against NFE, one panel per ELEC.ACCESS level
ggplot(df_g5, aes(NFE, INCOME)) +
  geom_point(color = "aquamarine4") +
  facet_wrap(~ELEC.ACCESS, ncol = 2) +
  ggtitle("NFE and Income grouped by ELEC.ACCESS") +
  scale_y_continuous(labels = comma)

str(allvariables.1nonas)

## 'data.frame':    34442 obs. of  26 variables:
##  $ STATEID       : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ DISTID        : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ VNID          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ROOMS         : num  12 10 3 4 10 5 5 2 7 2 ...
##  $ INCOME        : num  176100 1039150 182340 90760 212600 ...
##  $ EDU.HH        : Factor w/ 18 levels "none 0","1st class 1",..: 1 1 11 1 1 1 1 1 5 1 ...
##  $ WATER         : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 1 2 2 ...
##  $ RICE.P        : num  17 20 15 20 20 12 25 25 12 25 ...
##  $ ELEC.ACCESS   : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS      : num  12 8 8 8 8 14 3 22 22 8 ...
##  $ NFE           : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ASSETS        : num  20 24 22 16 17 12 13 7 11 10 ...
##  $ NADULTM       : num  2 5 1 1 3 4 2 1 1 1 ...
##  $ NADULTF       : num  2 3 2 1 3 3 2 1 1 1 ...
##  $ NCHILDM       : num  2 3 1 1 1 2 0 4 3 2 ...
##  $ NCHILDF       : num  2 3 1 1 1 2 0 1 0 0 ...
##  $ NTEENM        : num  1 1 1 0 2 0 0 0 2 0 ...
##  $ NTEENF        : num  0 1 1 1 0 0 4 1 1 0 ...
##  $ URBAN2011     : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ NADULT        : num  4 8 3 2 6 7 4 2 2 2 ...
##  $ NCHILD        : num  4 6 2 2 2 4 0 5 3 2 ...
##  $ NTEEN         : num  1 2 2 1 2 0 4 1 3 0 ...
##  $ ELEC.HRS.3    : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4    : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
##  $ filter_.      : Factor w/ 2 levels "Not Selected",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS.3.NEW: Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:5512] 42 148 189 198 221 234 270 272 274 278 ...
##   .. ..- attr(*, "names")= chr [1:5512] "42" "148" "189" "198" ...

# INCOME against ELEC.HRS, one panel per ELEC.HRS.4 level
ggplot(allvariables.1nonas, aes(ELEC.HRS, INCOME)) +
  geom_point(color = "aquamarine4") +
  facet_wrap(~ELEC.HRS.4, ncol = 2) +
  ggtitle("Elect Hours and Income grouped by ELEC.HRS.4") +
  scale_y_continuous(labels = comma)

# INCOME against ASSETS, one panel per ELEC.HRS.3 level
ggplot(allvariables.1nonas, aes(ASSETS, INCOME)) +
  geom_point(color = "aquamarine4") +
  facet_wrap(~ELEC.HRS.3, ncol = 3) +
  ggtitle("Assets and Income grouped by ELEC.HRS.3") +
  scale_y_continuous(labels = comma)

# INCOME against ELEC.HRS, one panel per NFE level
ggplot(allvariables.1nonas, aes(ELEC.HRS, INCOME)) +
  geom_point(color = "aquamarine4") +
  facet_wrap(~NFE, ncol = 2) +
  ggtitle("Elect Hours and Income grouped by NFE") +
  scale_y_continuous(labels = comma)

# INCOME against ASSETS, one panel per ELEC.HRS.4 level
ggplot(allvariables.1nonas, aes(ASSETS, INCOME)) +
  geom_point(color = "aquamarine4") +
  facet_wrap(~ELEC.HRS.4, ncol = 4) +
  ggtitle("Assets and Income grouped by ELEC.HRS.4") +
  scale_y_continuous(labels = comma)

# INCOME against ASSETS, one panel per URBAN2011 x ELEC.HRS.4 combination;
# the y axis labels are formatted without scientific notation.
ggplot(allvariables.1nonas, aes(ASSETS, INCOME)) +
  geom_point(color = "aquamarine4") +
  facet_wrap(~URBAN2011 + ELEC.HRS.4, ncol = 4) +
  ggtitle("Assets and Income grouped by ELEC.HRS.4 & URBAN2011") +
  scale_y_continuous(
    labels = function(value) {
      format(value, scientific = FALSE)
    }
  )

# INCOME by ELEC.ACCESS, one coloured panel per state
ggplot(df_g5, aes(x = ELEC.ACCESS, y = INCOME)) +
  geom_point(aes(colour = STATEID)) +
  facet_wrap(~STATEID) +
  scale_y_continuous(labels = comma)

# log(INCOME) against ASSETS with a smoother, one panel per state
ggplot(df_g5, aes(x = ASSETS, y = log(INCOME))) +
  geom_point(aes(colour = STATEID)) +
  facet_wrap(~STATEID) +
  stat_smooth()

# Histogram of log(INCOME), filled by ELEC.HRS.4
ggplot(df_g5) +
  geom_histogram(aes(x = log(INCOME), fill = ELEC.HRS.4)) +
  theme_grey()

# Same histogram with the ELEC.HRS.4 groups side by side
ggplot(df_g5) +
  geom_histogram(aes(x = log(INCOME), fill = ELEC.HRS.4), position = "dodge")

# Density of log(INCOME) per ELEC.HRS.4 level, as outlines
ggplot(df_g4) +
  geom_density(aes(x = log(INCOME), colour = ELEC.HRS.4))

# Density of log(INCOME) per ELEC.HRS.4 level, as filled areas
ggplot(df_g4) +
  geom_density(aes(x = log(INCOME), fill = ELEC.HRS.4))

###################################

# Keep only the numeric columns of the data frame and drop every row that
# has a missing value in any of them.
library(dplyr)
new_df <- na.omit(
  allvariables.1[vapply(allvariables.1, is.numeric, logical(1))]
)
dim(new_df)

## [1] 34719    16

str(new_df)

## 'data.frame':    34719 obs. of  16 variables:
##  $ DISTID  : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ VNID    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ROOMS   : num  12 10 3 4 10 5 5 2 7 2 ...
##  $ INCOME  : num  176100 1039150 182340 90760 212600 ...
##  $ RICE.P  : num  17 20 15 20 20 12 25 25 12 25 ...
##  $ ELEC.HRS: num  12 8 8 8 8 14 3 22 22 8 ...
##  $ ASSETS  : num  20 24 22 16 17 12 13 7 11 10 ...
##  $ NADULTM : num  2 5 1 1 3 4 2 1 1 1 ...
##  $ NADULTF : num  2 3 2 1 3 3 2 1 1 1 ...
##  $ NCHILDM : num  2 3 1 1 1 2 0 4 3 2 ...
##  $ NCHILDF : num  2 3 1 1 1 2 0 1 0 0 ...
##  $ NTEENM  : num  1 1 1 0 2 0 0 0 2 0 ...
##  $ NTEENF  : num  0 1 1 1 0 0 4 1 1 0 ...
##  $ NADULT  : num  4 8 3 2 6 7 4 2 2 2 ...
##  $ NCHILD  : num  4 6 2 2 2 4 0 5 3 2 ...
##  $ NTEEN   : num  1 2 2 1 2 0 4 1 3 0 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:5235] 309 500 799 1187 1352 1572 1622 1955 1967 2161 ...
##   .. ..- attr(*, "names")= chr [1:5235] "309" "500" "799" "1187" ...

# Interactive data table of the numeric columns, five rows per page
library(DT)
datatable(
  new_df,
  options = list(pageLength = 5)
)

# Static table of the first rows
library(knitr)
kable(head(new_df))

DISTID	VNID	ROOMS	INCOME	RICE.P	ELEC.HRS	ASSETS	NADULTM	NADULTF	NCHILDM	NCHILDF	NTEENM	NTEENF	NADULT	NCHILD	NTEEN
2	1	12	176100	17	12	20	2	2	2	2	1	0	4	4	1
2	1	10	1039150	20	8	24	5	3	3	3	1	1	8	6	2
2	1	3	182340	15	8	22	1	2	1	1	1	1	3	2	2
2	1	4	90760	20	8	16	1	1	1	1	0	1	2	2	1
2	1	10	212600	20	8	17	3	3	1	1	2	0	6	2	2
2	1	5	152100	12	14	12	4	3	2	2	0	0	7	4	0

# Columns of new_df that still contain at least one NA
colnos <- which(colSums(is.na(new_df)) > 0)
colnos

## named integer(0)

# Class of every column of new_df
vapply(new_df, class, character(1))

##    DISTID      VNID     ROOMS    INCOME    RICE.P  ELEC.HRS    ASSETS 
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" 
##   NADULTM   NADULTF   NCHILDM   NCHILDF    NTEENM    NTEENF    NADULT 
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" 
##    NCHILD     NTEEN 
## "numeric" "numeric"

# Mean of every column of new_df
vapply(new_df, mean, numeric(1))

##       DISTID         VNID        ROOMS       INCOME       RICE.P 
## 1.459187e+01 5.799677e+00 2.792678e+00 1.361545e+05 2.260492e+01 
##     ELEC.HRS       ASSETS      NADULTM      NADULTF      NCHILDM 
## 1.532054e+01 1.668634e+01 1.463925e+00 1.528903e+00 6.913794e-01 
##      NCHILDF       NTEENM       NTEENF       NADULT       NCHILD 
## 6.300873e-01 2.871338e-01 2.926928e-01 2.992828e+00 1.321467e+00 
##        NTEEN 
## 5.798266e-01

# Dot plots

# Rows of allvariables.1scat with an NA in any of its nine columns
df6 <- which(rowSums(is.na(allvariables.1scat[, 1:9])) > 0)
# df6 # row numbers that have NAs
length(df6) # number of rows to be deleted

## [1] 5211

# Remove the rows flagged in df6 (rows containing NAs).
# Negative indexing with an empty index vector selects zero rows instead of
# all rows, so the subset is only taken when df6 is non-empty.
df_g7 <- if (length(df6) > 0) {
  allvariables.1scat[-df6, ]
} else {
  allvariables.1scat
}
str(df_g7)

## 'data.frame':    34743 obs. of  9 variables:
##  $ STATEID    : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ INCOME     : num  176100 1039150 182340 90760 212600 ...
##  $ ELEC.ACCESS: Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS   : num  12 8 8 8 8 14 3 22 22 8 ...
##  $ NFE        : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ASSETS     : num  20 24 22 16 17 12 13 7 11 10 ...
##  $ URBAN2011  : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4 : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...

# Number of NAs remaining in each column of df_g7
colSums(is.na(df_g7))

##     STATEID      INCOME ELEC.ACCESS    ELEC.HRS         NFE      ASSETS 
##           0           0           0           0           0           0 
##   URBAN2011  ELEC.HRS.3  ELEC.HRS.4 
##           0           0           0

# Dot plot of ELEC.HRS, one panel per level of the 3-level grouping
ggplot(df_g7, aes(x = ELEC.HRS)) +
  geom_dotplot(dotsize = 0.4, colour = "darkred") +
  facet_grid(~ELEC.HRS.3) +
  ggtitle("Dot plot of ELEC.HRS grouped by ELEC.HRS.3 ")

# Same plot for the 4-level grouping; the title names ELEC.HRS.4 so that it
# matches the variable actually used for the facets
ggplot(df_g7, aes(x = ELEC.HRS)) +
  geom_dotplot(dotsize = 0.4, colour = "darkred") +
  facet_grid(~ELEC.HRS.4) +
  ggtitle("Dot plot of ELEC.HRS grouped by ELEC.HRS.4 ")

# Distribution of the variable INCOME

# Histogram after some cleaning. df_g5 is presumably the data with the very
# high values (which look like errors) removed -- confirm where df_g5 is built.
ggplot(df_g5, aes(INCOME)) +
  geom_histogram(color = "white", bins = 40) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8)) +
  scale_x_continuous(
    breaks = seq(-100000, 15000000, 1000000),
    labels = comma
  )

# Same variable with the default binning and a solid fill
ggplot(df_g5) +
  geom_histogram(aes(x = INCOME), fill = "darkgreen") +
  theme_grey()

# How the distributions change across the levels of different factors

library(mosaic)

# INCOME and ASSETS conditioned on the two-level factors (panels stacked 1 x 2)
histogram(
  ~ INCOME | ELEC.ACCESS,
  layout = c(1, 2),
  data = df_g5,
  main = "Income by ELEC.ACCESS",
  col = "darkgreen"
)

histogram(
  ~ ASSETS | ELEC.ACCESS,
  layout = c(1, 2),
  data = df_g5,
  main = "Assets by ELEC.ACCESS",
  col = "darkgreen"
)

histogram(
  ~ ASSETS | URBAN2011,
  layout = c(1, 2),
  data = df_g5,
  main = "Assets by URBAN2011",
  col = "darkgreen"
)

# ASSETS conditioned on the electricity-hours groupings and on NFE
histogram(
  ~ ASSETS | ELEC.HRS.3,
  data = df_g5,
  main = "Assets by ELEC.HRS.3",
  col = "darkgreen"
)

histogram(
  ~ ASSETS | ELEC.HRS.4,
  data = df_g5,
  main = "Assets by ELEC.HRS.4",
  col = "darkgreen"
)

histogram(
  ~ ASSETS | NFE,
  data = df_g5,
  main = "Assets by NFE",
  col = "darkgreen"
)

# ASSETS and INCOME conditioned on STATEID (one panel per state)
histogram(
  ~ ASSETS | STATEID,
  data = df_g5,
  main = "Assets by STATEID",
  col = "darkgreen"
)

histogram(
  ~ INCOME | STATEID,
  data = df_g5,
  main = "INCOME by STATEID",
  col = "darkgreen"
)

# Using ggplot2

# In every plot the fill colour only repeats the facet variable, so the legend
# is dropped. guides(fill = "none") is used because passing FALSE to guides()
# is deprecated in current ggplot2.

# Faceted by ELEC.HRS.4
ggplot(df_g5, aes(ASSETS)) +
  geom_histogram(color = "white", aes(fill = ELEC.HRS.4), bins = 30) +
  theme(axis.text.x = element_text(angle = 90)) +
  facet_grid(~ELEC.HRS.4) +
  guides(fill = "none")

# Faceted by ELEC.HRS.4 and URBAN2011; each row of panels gets its own y scale
ggplot(df_g5, aes(ASSETS)) +
  geom_histogram(color = "white", aes(fill = ELEC.HRS.4), bins = 30) +
  theme(axis.text.x = element_text(angle = 90)) +
  facet_grid(URBAN2011 ~ ELEC.HRS.4, scales = "free_y") +
  guides(fill = "none")

# Same facets, with x-axis breaks every 4 units and the axis labels rotated
# using angle = 270 instead of angle = 90
ggplot(df_g5, aes(ASSETS)) +
  geom_histogram(color = "white", aes(fill = ELEC.HRS.4), bins = 30) +
  theme(axis.text.x = element_text(angle = 270)) +
  facet_grid(URBAN2011 ~ ELEC.HRS.4, scales = "free_y") +
  scale_x_continuous(breaks = seq(0, 40, 4)) +
  guides(fill = "none")

#############################

library(Hmisc)

# Normal Q-Q plot of INCOME with a small histogram inset in the top-left corner
tmp <- new_df$INCOME
qqnorm(tmp)
qqline(tmp)

# subplot() draws the histogram inside the existing plot region; its return
# value holds the graphical parameters that were in effect for the inset
tmp2 <- subplot(
  hist(tmp, xlab = "", ylab = "", main = ""),
  cnvrt.coords(0.1, 0.9, "plt")$usr,
  vadj = 1, hadj = 0
)

# Switch to the inset's coordinate system to add a reference line at 0 to the
# histogram, then restore the main plot's parameters so that later plots are
# not drawn with the inset's settings
op <- par(no.readonly = TRUE)
par(tmp2)
abline(v = 0, col = "green")
par(op)

# Various ways to summarise new_df

# Base R: six-number summary of every column
summary(object = new_df)

##      DISTID           VNID          ROOMS            INCOME        
##  Min.   : 1.00   Min.   : 1.0   Min.   : 1.000   Min.   :-1037040  
##  1st Qu.: 6.00   1st Qu.: 2.0   1st Qu.: 2.000   1st Qu.:   43500  
##  Median :12.00   Median : 4.0   Median : 2.000   Median :   80590  
##  Mean   :14.59   Mean   : 5.8   Mean   : 2.793   Mean   :  136154  
##  3rd Qu.:20.00   3rd Qu.: 7.0   3rd Qu.: 4.000   3rd Qu.:  154222  
##  Max.   :68.00   Max.   :39.0   Max.   :50.000   Max.   :11360000  
##      RICE.P         ELEC.HRS         ASSETS         NADULTM     
##  Min.   :  0.0   Min.   : 0.00   Min.   : 1.00   Min.   :0.000  
##  1st Qu.: 18.0   1st Qu.: 9.00   1st Qu.:12.00   1st Qu.:1.000  
##  Median : 22.0   Median :16.00   Median :17.00   Median :1.000  
##  Mean   : 22.6   Mean   :15.32   Mean   :16.69   Mean   :1.464  
##  3rd Qu.: 27.0   3rd Qu.:22.00   3rd Qu.:21.00   3rd Qu.:2.000  
##  Max.   :120.0   Max.   :24.00   Max.   :33.00   Max.   :9.000  
##     NADULTF         NCHILDM          NCHILDF            NTEENM      
##  Min.   :0.000   Min.   :0.0000   Min.   : 0.0000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.0000  
##  Median :1.000   Median :0.0000   Median : 0.0000   Median :0.0000  
##  Mean   :1.529   Mean   :0.6914   Mean   : 0.6301   Mean   :0.2871  
##  3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.: 1.0000   3rd Qu.:0.0000  
##  Max.   :9.000   Max.   :8.0000   Max.   :10.0000   Max.   :5.0000  
##      NTEENF           NADULT           NCHILD           NTEEN       
##  Min.   :0.0000   Min.   : 0.000   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 2.000   1st Qu.: 0.000   1st Qu.:0.0000  
##  Median :0.0000   Median : 3.000   Median : 1.000   Median :0.0000  
##  Mean   :0.2927   Mean   : 2.993   Mean   : 1.321   Mean   :0.5798  
##  3rd Qu.:0.0000   3rd Qu.: 4.000   3rd Qu.: 2.000   3rd Qu.:1.0000  
##  Max.   :5.0000   Max.   :18.000   Max.   :18.000   Max.   :7.0000

# Hmisc summary of every column. The namespace is spelled out because psych
# also exports a describe() function.
library(Hmisc)
Hmisc::describe(new_df)

## new_df 
## 
##  16  Variables      34719  Observations
## ---------------------------------------------------------------------------
## DISTID 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      60       1   14.59       2       3       6      12 
##     .75     .90     .95 
##      20      29      34 
## 
## lowest :  1  2  3  4  5, highest: 63 65 66 67 68 
## ---------------------------------------------------------------------------
## VNID 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      39    0.99     5.8       1       1       2       4 
##     .75     .90     .95 
##       7      11      15 
## 
## lowest :  1  2  3  4  5, highest: 35 36 37 38 39 
## ---------------------------------------------------------------------------
## ROOMS 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      25    0.94   2.793       1       1       2       2 
##     .75     .90     .95 
##       4       5       6 
## 
## lowest :  1  2  3  4  5, highest: 23 24 25 26 50 
## ---------------------------------------------------------------------------
## INCOME 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0   13698       1  136154   12000   22000   43500   80590 
##     .75     .90     .95 
##  154223  289606  423000 
## 
## lowest : -1037040  -867025  -245000  -214475  -208138
## highest:  8096550  8322000  9563500 11169820 11360000 
## ---------------------------------------------------------------------------
## RICE.P 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      96    0.99    22.6      10      15      18      22 
##     .75     .90     .95 
##      27      32      35 
## 
## lowest :   0.0   0.5   1.0   2.0   2.5
## highest:  80.0  90.0  95.0 100.0 120.0 
## ---------------------------------------------------------------------------
## ELEC.HRS 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      25    0.99   15.32       5       6       9      16 
##     .75     .90     .95 
##      22      24      24 
## 
## lowest :  0  1  2  3  4, highest: 20 21 22 23 24 
## ---------------------------------------------------------------------------
## ASSETS 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      33       1   16.69       7       8      12      17 
##     .75     .90     .95 
##      21      24      26 
## 
## lowest :  1  2  3  4  5, highest: 29 30 31 32 33 
## ---------------------------------------------------------------------------
## NADULTM 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      10    0.81   1.464       0       1       1       1 
##     .75     .90     .95 
##       2       3       3 
## 
##              0     1    2    3   4   5  6 7 8 9
## Frequency 2411 19192 8961 3216 716 160 49 8 4 2
## %            7    55   26    9   2   0  0 0 0 0
## ---------------------------------------------------------------------------
## NADULTF 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      10    0.79   1.529       1       1       1       1 
##     .75     .90     .95 
##       2       3       3 
## 
##             0     1     2    3   4   5  6 7 8 9
## Frequency 687 19611 10772 2853 651 117 20 6 1 1
## %           2    56    31    8   2   0  0 0 0 0
## ---------------------------------------------------------------------------
## NCHILDM 
##       n missing  unique    Info    Mean 
##   34719       0       9    0.83  0.6914 
## 
##               0     1    2   3   4  5  6 7 8
## Frequency 18125 10809 4549 943 217 60 13 2 1
## %            52    31   13   3   1  0  0 0 0
## ---------------------------------------------------------------------------
## NCHILDF 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      11    0.78  0.6301       0       0       0       0 
##     .75     .90     .95 
##       1       2       2 
## 
##               0    1    2    3   4  5  6 7 8 9 10
## Frequency 20068 9543 3606 1069 305 90 28 7 1 1  1
## %            58   27   10    3   1  0  0 0 0 0  0
## ---------------------------------------------------------------------------
## NTEENM 
##       n missing  unique    Info    Mean 
##   34719       0       6    0.55  0.2871 
## 
##               0    1    2   3 4 5
## Frequency 26402 6801 1390 117 8 1
## %            76   20    4   0 0 0
## ---------------------------------------------------------------------------
## NTEENF 
##       n missing  unique    Info    Mean 
##   34719       0       6    0.56  0.2927 
## 
##               0    1    2   3  4 5
## Frequency 26340 6803 1394 159 21 2
## %            76   20    4   0  0 0
## ---------------------------------------------------------------------------
## NADULT 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      17    0.91   2.993       1       2       2       3 
##     .75     .90     .95 
##       4       5       6 
## 
##            0    1     2    3    4    5    6   7   8  9 10 11 12 13 14 16
## Frequency 32 2157 14304 8014 5621 2442 1321 452 217 83 40 18  8  6  2  1
## %          0    6    41   23   16    7    4   1   1  0  0  0  0  0  0  0
##           18
## Frequency  1
## %          0
## ---------------------------------------------------------------------------
## NCHILD 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      14    0.92   1.321       0       0       0       1 
##     .75     .90     .95 
##       2       3       4 
## 
##               0    1    2    3    4   5   6  7  8  9 10 11 15 18
## Frequency 13008 7641 7948 3679 1529 565 201 83 34 18 10  1  1  1
## %            37   22   23   11    4   2   1  0  0  0  0  0  0  0
## ---------------------------------------------------------------------------
## NTEEN 
##       n missing  unique    Info    Mean 
##   34719       0       8    0.76  0.5798 
## 
##               0    1    2   3   4  5 6 7
## Frequency 21195 8286 4070 998 145 21 2 2
## %            61   24   12   3   0  0 0 0
## ---------------------------------------------------------------------------

# scipen is an integer penalty, not an on/off flag: positive values bias
# printing towards fixed notation. The value 1 is written out explicitly
# rather than relying on the reassignable shorthand T being coerced to it.
# NOTE(review): raise the value if scientific notation should be avoided
# altogether.
options(scipen = 1)

# Basic statistics for every column, rendered as a table with two decimals
library(fBasics)
kable(basicStats(new_df), digits = 2)

	DISTID	VNID	ROOMS	INCOME	RICE.P	ELEC.HRS	ASSETS	NADULTM	NADULTF	NCHILDM	NCHILDF	NTEENM	NTEENF	NADULT	NCHILD	NTEEN
nobs	34719.00	34719.00	34719.00	34719.00	34719.00	34719.00	34719.00	34719.00	34719.00	34719.00	34719.00	34719.00	34719.00	34719.00	34719.00	34719.00
NAs	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00
Minimum	1.00	1.00	1.00	-1037040.00	0.00	0.00	1.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00
Maximum	68.00	39.00	50.00	11360000.00	120.00	24.00	33.00	9.00	9.00	8.00	10.00	5.00	5.00	18.00	18.00	7.00
1. Quartile	6.00	2.00	2.00	43500.00	18.00	9.00	12.00	1.00	1.00	0.00	0.00	0.00	0.00	2.00	0.00	0.00
3. Quartile	20.00	7.00	4.00	154222.50	27.00	22.00	21.00	2.00	2.00	1.00	1.00	0.00	0.00	4.00	2.00	1.00
Mean	14.59	5.80	2.79	136154.48	22.60	15.32	16.69	1.46	1.53	0.69	0.63	0.29	0.29	2.99	1.32	0.58
Median	12.00	4.00	2.00	80590.00	22.00	16.00	17.00	1.00	1.00	0.00	0.00	0.00	0.00	3.00	1.00	0.00
Sum	506615.00	201359.00	96959.00	4727147456.95	784820.30	531914.00	579333.00	50826.00	53082.00	24004.00	21876.00	9969.00	10162.00	103908.00	45880.00	20131.00
SE Mean	0.06	0.03	0.01	1227.71	0.05	0.04	0.03	0.00	0.00	0.00	0.00	0.00	0.00	0.01	0.01	0.00
LCL Mean	14.47	5.74	2.77	133748.14	22.51	15.25	16.62	1.45	1.52	0.68	0.62	0.28	0.29	2.98	1.31	0.57
UCL Mean	14.71	5.86	2.81	138560.83	22.70	15.39	16.75	1.47	1.54	0.70	0.64	0.29	0.30	3.01	1.34	0.59
Variance	133.65	34.13	2.96	52330646462.37	80.08	46.21	36.27	0.80	0.63	0.76	0.82	0.31	0.32	2.05	1.93	0.72
Stdev	11.56	5.84	1.72	228758.93	8.95	6.80	6.02	0.89	0.79	0.87	0.91	0.56	0.57	1.43	1.39	0.85
Skewness	1.74	2.96	2.63	15.86	0.70	-0.28	-0.07	1.23	1.34	1.37	1.78	1.96	2.06	1.44	1.24	1.46
Kurtosis	4.42	11.09	25.80	549.95	5.82	-1.34	-0.69	2.80	2.79	2.31	4.35	3.74	4.51	3.56	2.73	1.84

# psych summary of every column, rendered as a table with two decimals. The
# namespace is spelled out because Hmisc also exports a describe() function.
library(psych)
kable(psych::describe(new_df), digits = 2)

	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
DISTID	1	34719	14.59	11.56	12	13.06	10.38	1	68	67	1.74	4.42	0.06
VNID	2	34719	5.80	5.84	4	4.73	2.97	1	39	38	2.96	11.09	0.03
ROOMS	3	34719	2.79	1.72	2	2.56	1.48	1	50	49	2.63	25.80	0.01
INCOME	4	34719	136154.48	228758.93	80590	99396.43	68125.47	-1037040	11360000	12397040	15.86	549.95	1227.71
RICE.P	5	34719	22.60	8.95	22	22.48	5.93	0	120	120	0.70	5.82	0.05
ELEC.HRS	6	34719	15.32	6.80	16	15.62	8.90	0	24	24	-0.28	-1.34	0.04
ASSETS	7	34719	16.69	6.02	17	16.75	7.41	1	33	32	-0.07	-0.69	0.03
NADULTM	8	34719	1.46	0.89	1	1.37	0.00	0	9	9	1.23	2.80	0.00
NADULTF	9	34719	1.53	0.79	1	1.40	0.00	0	9	9	1.34	2.79	0.00
NCHILDM	10	34719	0.69	0.87	0	0.56	0.00	0	8	8	1.37	2.31	0.00
NCHILDF	11	34719	0.63	0.91	0	0.46	0.00	0	10	10	1.78	4.35	0.00
NTEENM	12	34719	0.29	0.56	0	0.17	0.00	0	5	5	1.96	3.74	0.00
NTEENF	13	34719	0.29	0.57	0	0.18	0.00	0	5	5	2.06	4.51	0.00
NADULT	14	34719	2.99	1.43	3	2.81	1.48	0	18	18	1.44	3.56	0.01
NCHILD	15	34719	1.32	1.39	1	1.13	1.48	0	18	18	1.24	2.73	0.01
NTEEN	16	34719	0.58	0.85	0	0.43	0.00	0	7	7	1.46	1.84	0.00

# Scatterplot matrices of columns 3 to 7 of new_df
# (ROOMS, INCOME, RICE.P, ELEC.HRS and ASSETS)

# car version: histograms on the diagonal, no smoother
library(car)
scatterplotMatrix(
  new_df[, 3:7],
  diagonal = "histogram",
  smooth = FALSE
)

# gpairs version: statistics in the upper panels instead of scatterplots
library(gpairs)
gpairs(
  new_df[, 3:7],
  upper.pars = list(scatter = "stats"),
  stat.pars = list(verbose = FALSE)
)

# Density plots for columns 3 to 7 of new_df, laid out on a 3 x 3 grid.
# par() returns the previous values of the settings it changes; they are kept
# so that the margins can be restored afterwards as well as the layout.
old_par <- par(mfrow = c(3, 3), mar = c(2.5, 2, 1.5, 1.5))
col_names <- names(new_df)  # avoids shadowing base::colnames()
for (i in 3:7) {
  d <- density(new_df[[i]])
  # Set up an empty plot first, then draw the density as a filled polygon
  plot(d, type = "n", main = col_names[i])
  polygon(d, col = "red", border = "gray")
}

# Back to a single-panel layout with the margins that were in effect before
par(mfrow = c(1, 1), mar = old_par$mar)

My files for reference:

**file:///J:/rstudio%20files/BoulderBCycle.html**

Categorical Data Visualisation project

Prepared by Nishant Upadhyay

16 June, 2016