Source file ⇒ categoricalvisualisationproject.rmd

Exploratory data analysis

REVEAL THYSELF……………

library(foreign)
# Import the SPSS file. read.spss() hands back a list here, so the result is
# wrapped in as.data.frame() straight away; that conversion is only needed for
# the SPSS import, not for the CSV alternative kept below.
allvariables.1 <- as.data.frame(read.spss("all variables.111.sav"))
# CSV alternative:
# allvariables.1 <- read.csv("all variables.1.csv", header = TRUE)
class(allvariables.1)
## [1] "data.frame"
class(allvariables.1)
## [1] "data.frame"
# Class of each column
sapply(allvariables.1,class)
##        STATEID         DISTID           VNID          ROOMS         INCOME 
##       "factor"      "numeric"      "numeric"      "numeric"      "numeric" 
##         EDU.HH          WATER         RICE.P    ELEC.ACCESS       ELEC.HRS 
##       "factor"       "factor"      "numeric"       "factor"      "numeric" 
##            NFE         ASSETS        NADULTM        NADULTF        NCHILDM 
##       "factor"      "numeric"      "numeric"      "numeric"      "numeric" 
##        NCHILDF         NTEENM         NTEENF      URBAN2011      NFE.TYPES 
##      "numeric"      "numeric"      "numeric"       "factor"       "factor" 
##         NADULT         NCHILD          NTEEN     ELEC.HRS.3     ELEC.HRS.4 
##      "numeric"      "numeric"      "numeric"       "factor"       "factor" 
##       filter_. ELEC.HRS.3.NEW 
##       "factor"       "factor"
str(allvariables.1)
## 'data.frame':    39954 obs. of  27 variables:
##  $ STATEID       : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ DISTID        : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ VNID          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ROOMS         : num  12 10 3 4 10 5 5 2 7 2 ...
##  $ INCOME        : num  176100 1039150 182340 90760 212600 ...
##  $ EDU.HH        : Factor w/ 18 levels "none 0","1st class 1",..: 1 1 11 1 1 1 1 1 5 1 ...
##  $ WATER         : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 1 2 2 ...
##  $ RICE.P        : num  17 20 15 20 20 12 25 25 12 25 ...
##  $ ELEC.ACCESS   : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS      : num  12 8 8 8 8 14 3 22 22 8 ...
##  $ NFE           : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ASSETS        : num  20 24 22 16 17 12 13 7 11 10 ...
##  $ NADULTM       : num  2 5 1 1 3 4 2 1 1 1 ...
##  $ NADULTF       : num  2 3 2 1 3 3 2 1 1 1 ...
##  $ NCHILDM       : num  2 3 1 1 1 2 0 4 3 2 ...
##  $ NCHILDF       : num  2 3 1 1 1 2 0 1 0 0 ...
##  $ NTEENM        : num  1 1 1 0 2 0 0 0 2 0 ...
##  $ NTEENF        : num  0 1 1 1 0 0 4 1 1 0 ...
##  $ URBAN2011     : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ NFE.TYPES     : Factor w/ 71 levels "Agriculture 0",..: 49 NA 65 NA NA NA 37 NA NA 46 ...
##  $ NADULT        : num  4 8 3 2 6 7 4 2 2 2 ...
##  $ NCHILD        : num  4 6 2 2 2 4 0 5 3 2 ...
##  $ NTEEN         : num  1 2 2 1 2 0 4 1 3 0 ...
##  $ ELEC.HRS.3    : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4    : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
##  $ filter_.      : Factor w/ 2 levels "Not Selected",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS.3.NEW: Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
head(allvariables.1)
##              STATEID DISTID VNID ROOMS  INCOME       EDU.HH WATER RICE.P
## 1 Jammu & Kashmir 01      2    1    12  176100       none 0 Yes 1     17
## 2 Jammu & Kashmir 01      2    1    10 1039150       none 0 Yes 1     20
## 3 Jammu & Kashmir 01      2    1     3  182340 Secondary 10 Yes 1     15
## 4 Jammu & Kashmir 01      2    1     4   90760       none 0 Yes 1     20
## 5 Jammu & Kashmir 01      2    1    10  212600       none 0 Yes 1     20
## 6 Jammu & Kashmir 01      2    1     5  152100       none 0 Yes 1     12
##   ELEC.ACCESS ELEC.HRS  NFE ASSETS NADULTM NADULTF NCHILDM NCHILDF NTEENM
## 1       Yes 1       12 No 0     20       2       2       2       2      1
## 2       Yes 1        8 No 0     24       5       3       3       3      1
## 3       Yes 1        8 No 0     22       1       2       1       1      1
## 4       Yes 1        8 No 0     16       1       1       1       1      0
## 5       Yes 1        8 No 0     17       3       3       1       1      2
## 6       Yes 1       14 No 0     12       4       3       2       2      0
##   NTEENF URBAN2011         NFE.TYPES NADULT NCHILD NTEEN ELEC.HRS.3
## 1      0   rural 0 Land transport 70      4      4     1       1-16
## 2      1   rural 0              <NA>      8      6     2       1-16
## 3      1   rural 0        Medical 93      3      2     2       1-16
## 4      1   rural 0              <NA>      2      2     1       1-16
## 5      0   rural 0              <NA>      6      2     2       1-16
## 6      0   rural 0              <NA>      7      4     0       1-16
##   ELEC.HRS.4 filter_. ELEC.HRS.3.NEW
## 1      12-18 Selected           1-16
## 2       6-12 Selected           1-16
## 3       6-12 Selected           1-16
## 4       6-12 Selected           1-16
## 5       6-12 Selected           1-16
## 6      12-18 Selected           1-16
tail(allvariables.1)
##             STATEID DISTID VNID ROOMS INCOME       EDU.HH WATER RICE.P
## 39949 Tamil Nadu 33     30   11     5  27000  7th class 7  No 0     26
## 39950 Tamil Nadu 33     30   11     2   9000       none 0 Yes 1     26
## 39951 Tamil Nadu 33     30   11     3 110900 Secondary 10  No 0     18
## 39952 Tamil Nadu 33     30   11     2  15000  5th class 5  No 0     26
## 39953 Tamil Nadu 33     30   11     1  69040  7th class 7  No 0     23
## 39954 Tamil Nadu 33     30   11     2  37000  6th class 6  No 0     23
##       ELEC.ACCESS ELEC.HRS   NFE ASSETS NADULTM NADULTF NCHILDM NCHILDF
## 39949       Yes 1        6 Yes 1     26       1       1       0       0
## 39950        No 0       NA  No 0      8       0       1       0       0
## 39951       Yes 1        6  No 0     17       1       2       1       0
## 39952       Yes 1        6 Yes 1     19       2       1       1       1
## 39953       Yes 1        8  No 0     15       1       3       0       1
## 39954       Yes 1        6 Yes 1     18       1       1       0       1
##       NTEENM NTEENF URBAN2011            NFE.TYPES NADULT NCHILD NTEEN
## 39949      0      0   urban 1  Retail household 67      2      0     0
## 39950      0      0   urban 1     Air transport 72      1      0     0
## 39951      0      2   urban 1   Retail textiles 66      3      1     2
## 39952      0      0   urban 1        Retail nec 68      3      2     0
## 39953      0      0   urban 1 Personal services 96      4      1     0
## 39954      1      1   urban 1       Retail food 65      2      1     2
##       ELEC.HRS.3 ELEC.HRS.4     filter_. ELEC.HRS.3.NEW
## 39949       1-16       6-12 Not Selected           1-16
## 39950       <NA>       <NA> Not Selected           <NA>
## 39951       1-16       6-12 Not Selected           1-16
## 39952       1-16       6-12 Not Selected           1-16
## 39953       1-16       6-12 Not Selected           1-16
## 39954       1-16       6-12 Not Selected           1-16
dim(allvariables.1)
## [1] 39954    27
summary(allvariables.1)  # gives the info about NAs
##               STATEID          DISTID           VNID       
##  Karnataka 29     : 3865   Min.   : 1.00   Min.   : 1.000  
##  Uttar Pradesh 09 : 3824   1st Qu.: 7.00   1st Qu.: 2.000  
##  Maharashtra 27   : 3309   Median :12.00   Median : 4.000  
##  Madhya Pradesh 23: 3123   Mean   :15.43   Mean   : 5.608  
##  Rajasthan 08     : 2707   3rd Qu.:21.00   3rd Qu.: 7.000  
##  West Bengal 19   : 2435   Max.   :68.00   Max.   :39.000  
##  (Other)          :20691                                   
##      ROOMS            INCOME                  EDU.HH        WATER      
##  Min.   : 0.000   Min.   :-1037040   none 0      :24520   No 0 :27201  
##  1st Qu.: 2.000   1st Qu.:   38220   5th class 5 : 3514   Yes 1:12613  
##  Median : 2.000   Median :   72470   Secondary 10: 2057   NA's :  140  
##  Mean   : 2.684   Mean   :  125336   4th class 4 : 1673                
##  3rd Qu.: 3.000   3rd Qu.:  141500   8th class 8 : 1524                
##  Max.   :50.000   Max.   :11360000   (Other)     : 6390                
##  NA's   :172                         NA's        :  276                
##      RICE.P       ELEC.ACCESS      ELEC.HRS        NFE       
##  Min.   :  0.00   No 0 : 4970   Min.   : 0.00   No 0 :31564  
##  1st Qu.: 18.00   Yes 1:34840   1st Qu.: 9.00   Yes 1: 8390  
##  Median : 20.00   NA's :  144   Median :16.00                
##  Mean   : 21.92                 Mean   :15.32                
##  3rd Qu.: 26.00                 3rd Qu.:22.00                
##  Max.   :120.00                 Max.   :24.00                
##                                 NA's   :5197                 
##      ASSETS         NADULTM         NADULTF         NCHILDM       
##  Min.   : 0.00   Min.   :0.000   Min.   :0.000   Min.   : 0.0000  
##  1st Qu.:10.00   1st Qu.:1.000   1st Qu.:1.000   1st Qu.: 0.0000  
##  Median :16.00   Median :1.000   Median :1.000   Median : 0.0000  
##  Mean   :15.39   Mean   :1.424   Mean   :1.492   Mean   : 0.7157  
##  3rd Qu.:21.00   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.: 1.0000  
##  Max.   :33.00   Max.   :9.000   Max.   :9.000   Max.   :10.0000  
##  NA's   :18                                                       
##     NCHILDF            NTEENM           NTEENF         URBAN2011    
##  Min.   : 0.0000   Min.   :0.0000   Min.   :0.0000   rural 0:26134  
##  1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:0.0000   urban 1:13820  
##  Median : 0.0000   Median :0.0000   Median :0.0000                  
##  Mean   : 0.6566   Mean   :0.2852   Mean   :0.2926                  
##  3rd Qu.: 1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000                  
##  Max.   :10.0000   Max.   :5.0000   Max.   :5.0000                  
##                                                                     
##              NFE.TYPES         NADULT           NCHILD      
##  Agriculture 0    : 7570   Min.   : 0.000   Min.   : 0.000  
##  Construction 50  : 3190   1st Qu.: 2.000   1st Qu.: 0.000  
##  Public admin 90  : 1565   Median : 2.000   Median : 1.000  
##  Retail food 65   : 1230   Mean   : 2.917   Mean   : 1.372  
##  Land transport 70:  954   3rd Qu.: 4.000   3rd Qu.: 2.000  
##  (Other)          : 9042   Max.   :18.000   Max.   :18.000  
##  NA's             :16403                                    
##      NTEEN            ELEC.HRS.3    ELEC.HRS.4            filter_.    
##  Min.   :0.0000   No Access:   22   0-6  : 2975   Not Selected:36056  
##  1st Qu.:0.0000   1-16     :16091   6-12 : 8550   Selected    : 3898  
##  Median :0.0000   17-24    :18644   12-18: 6252                       
##  Mean   :0.5778   NA's     : 5197   18-24:16980                       
##  3rd Qu.:1.0000                     NA's : 5197                       
##  Max.   :7.0000                                                       
##                                                                       
##    ELEC.HRS.3.NEW 
##  No Access:   92  
##  1-16     :17334  
##  16-24    :17331  
##  NA's     : 5197  
##                   
##                   
## 
#utils::View(allvariables.1) Shows the entire data set

# allvariables.1$STATEID<-as.factor(as.character(allvariables.1$STATEID))
# allvariables.1$EDU.HH<-as.factor(as.character(allvariables.1$EDU.HH))
# allvariables.1$WATER<-as.factor(as.character(allvariables.1$WATER))
# allvariables.1$ ELEC.ACCESS<-as.factor(as.character(allvariables.1$ ELEC.ACCESS))
# allvariables.1$NFE<-as.factor(as.character(allvariables.1$NFE))
# allvariables.1$URBAN2011<-as.factor(as.character(allvariables.1$URBAN2011))
# allvariables.1$NFE.TYPES<-as.factor(as.character(allvariables.1$NFE.TYPES))
# allvariables.1$ELEC.HRS.3<-as.factor(as.character(allvariables.1$ELEC.HRS.3))
# allvariables.1$ELEC.HRS.4<-as.factor(as.character(allvariables.1$ELEC.HRS.4))
# allvariables.1$filter_.<-as.factor(as.character(allvariables.1$filter_.))

# sapply(allvariables.1,class)
# str(allvariables.1)
# summary(allvariables.1)


# Rows that contain at least one missing value.
# rowSums() over the logical NA matrix counts the missing cells per row;
# rownos keeps the positions of the rows where that count is above zero.
rownos <- which(rowSums(is.na(allvariables.1)) > 0)
length(rownos)  # 19278 rows have at least one NA
## [1] 19278
# The same count obtained directly: rows that are not complete cases.
rowsums <- sum(!complete.cases(allvariables.1))
rowsums
## [1] 19278
# Number of NAs in each column, shown as a one-column data frame whose row
# names are the column names.
na_count <- vapply(
  allvariables.1,
  function(column) sum(is.na(column)),
  integer(1)
)
(na_count <- data.frame(na_count))
##                na_count
## STATEID               0
## DISTID                0
## VNID                  0
## ROOMS               172
## INCOME                0
## EDU.HH              276
## WATER               140
## RICE.P                0
## ELEC.ACCESS         144
## ELEC.HRS           5197
## NFE                   0
## ASSETS               18
## NADULTM               0
## NADULTF               0
## NCHILDM               0
## NCHILDF               0
## NTEENM                0
## NTEENF                0
## URBAN2011             0
## NFE.TYPES         16403
## NADULT                0
## NCHILD                0
## NTEEN                 0
## ELEC.HRS.3         5197
## ELEC.HRS.4         5197
## filter_.              0
## ELEC.HRS.3.NEW     5197
# Alternative: per-column NA counts printed as a named vector.
colSums(is.na(allvariables.1))
##        STATEID         DISTID           VNID          ROOMS         INCOME 
##              0              0              0            172              0 
##         EDU.HH          WATER         RICE.P    ELEC.ACCESS       ELEC.HRS 
##            276            140              0            144           5197 
##            NFE         ASSETS        NADULTM        NADULTF        NCHILDM 
##              0             18              0              0              0 
##        NCHILDF         NTEENM         NTEENF      URBAN2011      NFE.TYPES 
##              0              0              0              0          16403 
##         NADULT         NCHILD          NTEEN     ELEC.HRS.3     ELEC.HRS.4 
##              0              0              0           5197           5197 
##       filter_. ELEC.HRS.3.NEW 
##              0           5197
# Which columns contain NAs: the result is named by column and holds each
# column's position in the data frame.
colnos <- which(colSums(is.na(allvariables.1)) > 0)
colnos
##          ROOMS         EDU.HH          WATER    ELEC.ACCESS       ELEC.HRS 
##              4              6              7              9             10 
##         ASSETS      NFE.TYPES     ELEC.HRS.3     ELEC.HRS.4 ELEC.HRS.3.NEW 
##             12             20             24             25             27
# Drop NFE.TYPES: it has by far the most NAs (16403), and the row-wise NA
# removal below would otherwise throw away every one of those rows.
allvariables.1[["NFE.TYPES"]] <- NULL

# Keep only the rows with no NA in any of the remaining columns.
allvariables.1nonas <- na.omit(allvariables.1)
str(allvariables.1nonas)
## 'data.frame':    34442 obs. of  26 variables:
##  $ STATEID       : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ DISTID        : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ VNID          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ROOMS         : num  12 10 3 4 10 5 5 2 7 2 ...
##  $ INCOME        : num  176100 1039150 182340 90760 212600 ...
##  $ EDU.HH        : Factor w/ 18 levels "none 0","1st class 1",..: 1 1 11 1 1 1 1 1 5 1 ...
##  $ WATER         : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 1 2 2 ...
##  $ RICE.P        : num  17 20 15 20 20 12 25 25 12 25 ...
##  $ ELEC.ACCESS   : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS      : num  12 8 8 8 8 14 3 22 22 8 ...
##  $ NFE           : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ASSETS        : num  20 24 22 16 17 12 13 7 11 10 ...
##  $ NADULTM       : num  2 5 1 1 3 4 2 1 1 1 ...
##  $ NADULTF       : num  2 3 2 1 3 3 2 1 1 1 ...
##  $ NCHILDM       : num  2 3 1 1 1 2 0 4 3 2 ...
##  $ NCHILDF       : num  2 3 1 1 1 2 0 1 0 0 ...
##  $ NTEENM        : num  1 1 1 0 2 0 0 0 2 0 ...
##  $ NTEENF        : num  0 1 1 1 0 0 4 1 1 0 ...
##  $ URBAN2011     : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ NADULT        : num  4 8 3 2 6 7 4 2 2 2 ...
##  $ NCHILD        : num  4 6 2 2 2 4 0 5 3 2 ...
##  $ NTEEN         : num  1 2 2 1 2 0 4 1 3 0 ...
##  $ ELEC.HRS.3    : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4    : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
##  $ filter_.      : Factor w/ 2 levels "Not Selected",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS.3.NEW: Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:5512] 42 148 189 198 221 234 270 272 274 278 ...
##   .. ..- attr(*, "names")= chr [1:5512] "42" "148" "189" "198" ...
summary(allvariables.1nonas)
##               STATEID          DISTID          VNID       
##  Karnataka 29     : 3546   Min.   : 1.0   Min.   : 1.000  
##  Maharashtra 27   : 3095   1st Qu.: 6.0   1st Qu.: 2.000  
##  Madhya Pradesh 23: 2532   Median :12.0   Median : 4.000  
##  Uttar Pradesh 09 : 2326   Mean   :14.6   Mean   : 5.788  
##  Rajasthan 08     : 2296   3rd Qu.:20.0   3rd Qu.: 7.000  
##  Andhra Pradesh 28: 2118   Max.   :68.0   Max.   :39.000  
##  (Other)          :18529                                  
##      ROOMS            INCOME                  EDU.HH        WATER      
##  Min.   : 1.000   Min.   :-1037040   none 0      :20413   No 0 :22154  
##  1st Qu.: 2.000   1st Qu.:   43500   5th class 5 : 3201   Yes 1:12288  
##  Median : 2.000   Median :   80500   Secondary 10: 1960                
##  Mean   : 2.791   Mean   :  136009   4th class 4 : 1551                
##  3rd Qu.: 4.000   3rd Qu.:  154000   8th class 8 : 1417                
##  Max.   :50.000   Max.   :11360000   2nd class 2 : 1246                
##                                      (Other)     : 4654                
##      RICE.P       ELEC.ACCESS      ELEC.HRS        NFE       
##  Min.   :  0.00   No 0 :    0   Min.   : 0.00   No 0 :26760  
##  1st Qu.: 18.00   Yes 1:34442   1st Qu.: 9.00   Yes 1: 7682  
##  Median : 22.00                 Median :16.00                
##  Mean   : 22.59                 Mean   :15.32                
##  3rd Qu.: 27.00                 3rd Qu.:22.00                
##  Max.   :120.00                 Max.   :24.00                
##                                                              
##      ASSETS         NADULTM         NADULTF         NCHILDM      
##  Min.   : 1.00   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:12.00   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :17.00   Median :1.000   Median :1.000   Median :0.0000  
##  Mean   :16.68   Mean   :1.463   Mean   :1.528   Mean   :0.6926  
##  3rd Qu.:21.00   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :33.00   Max.   :9.000   Max.   :9.000   Max.   :8.0000  
##                                                                  
##     NCHILDF            NTEENM           NTEENF         URBAN2011    
##  Min.   : 0.0000   Min.   :0.0000   Min.   :0.0000   rural 0:21240  
##  1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:0.0000   urban 1:13202  
##  Median : 0.0000   Median :0.0000   Median :0.0000                  
##  Mean   : 0.6318   Mean   :0.2868   Mean   :0.2926                  
##  3rd Qu.: 1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000                  
##  Max.   :10.0000   Max.   :5.0000   Max.   :5.0000                  
##                                                                     
##      NADULT           NCHILD           NTEEN            ELEC.HRS.3   
##  Min.   : 0.000   Min.   : 0.000   Min.   :0.0000   No Access:   22  
##  1st Qu.: 2.000   1st Qu.: 0.000   1st Qu.:0.0000   1-16     :15932  
##  Median : 3.000   Median : 1.000   Median :0.0000   17-24    :18488  
##  Mean   : 2.991   Mean   : 1.324   Mean   :0.5795                    
##  3rd Qu.: 4.000   3rd Qu.: 2.000   3rd Qu.:1.0000                    
##  Max.   :18.000   Max.   :18.000   Max.   :7.0000                    
##                                                                      
##  ELEC.HRS.4            filter_.       ELEC.HRS.3.NEW 
##  0-6  : 2930   Not Selected:30607   No Access:   91  
##  6-12 : 8491   Selected    : 3835   1-16     :17168  
##  12-18: 6189                        16-24    :17183  
##  18-24:16832                                         
##                                                      
##                                                      
## 
dim(allvariables.1nonas)
## [1] 34442    26
head(allvariables.1nonas)
##              STATEID DISTID VNID ROOMS  INCOME       EDU.HH WATER RICE.P
## 1 Jammu & Kashmir 01      2    1    12  176100       none 0 Yes 1     17
## 2 Jammu & Kashmir 01      2    1    10 1039150       none 0 Yes 1     20
## 3 Jammu & Kashmir 01      2    1     3  182340 Secondary 10 Yes 1     15
## 4 Jammu & Kashmir 01      2    1     4   90760       none 0 Yes 1     20
## 5 Jammu & Kashmir 01      2    1    10  212600       none 0 Yes 1     20
## 6 Jammu & Kashmir 01      2    1     5  152100       none 0 Yes 1     12
##   ELEC.ACCESS ELEC.HRS  NFE ASSETS NADULTM NADULTF NCHILDM NCHILDF NTEENM
## 1       Yes 1       12 No 0     20       2       2       2       2      1
## 2       Yes 1        8 No 0     24       5       3       3       3      1
## 3       Yes 1        8 No 0     22       1       2       1       1      1
## 4       Yes 1        8 No 0     16       1       1       1       1      0
## 5       Yes 1        8 No 0     17       3       3       1       1      2
## 6       Yes 1       14 No 0     12       4       3       2       2      0
##   NTEENF URBAN2011 NADULT NCHILD NTEEN ELEC.HRS.3 ELEC.HRS.4 filter_.
## 1      0   rural 0      4      4     1       1-16      12-18 Selected
## 2      1   rural 0      8      6     2       1-16       6-12 Selected
## 3      1   rural 0      3      2     2       1-16       6-12 Selected
## 4      1   rural 0      2      2     1       1-16       6-12 Selected
## 5      0   rural 0      6      2     2       1-16       6-12 Selected
## 6      0   rural 0      7      4     0       1-16      12-18 Selected
##   ELEC.HRS.3.NEW
## 1           1-16
## 2           1-16
## 3           1-16
## 4           1-16
## 5           1-16
## 6           1-16
tail(allvariables.1nonas)
##             STATEID DISTID VNID ROOMS INCOME       EDU.HH WATER RICE.P
## 39948 Tamil Nadu 33     30   11     2  97400  6th class 6  No 0     26
## 39949 Tamil Nadu 33     30   11     5  27000  7th class 7  No 0     26
## 39951 Tamil Nadu 33     30   11     3 110900 Secondary 10  No 0     18
## 39952 Tamil Nadu 33     30   11     2  15000  5th class 5  No 0     26
## 39953 Tamil Nadu 33     30   11     1  69040  7th class 7  No 0     23
## 39954 Tamil Nadu 33     30   11     2  37000  6th class 6  No 0     23
##       ELEC.ACCESS ELEC.HRS   NFE ASSETS NADULTM NADULTF NCHILDM NCHILDF
## 39948       Yes 1        6  No 0     18       3       2       1       0
## 39949       Yes 1        6 Yes 1     26       1       1       0       0
## 39951       Yes 1        6  No 0     17       1       2       1       0
## 39952       Yes 1        6 Yes 1     19       2       1       1       1
## 39953       Yes 1        8  No 0     15       1       3       0       1
## 39954       Yes 1        6 Yes 1     18       1       1       0       1
##       NTEENM NTEENF URBAN2011 NADULT NCHILD NTEEN ELEC.HRS.3 ELEC.HRS.4
## 39948      1      0   urban 1      5      1     1       1-16       6-12
## 39949      0      0   urban 1      2      0     0       1-16       6-12
## 39951      0      2   urban 1      3      1     2       1-16       6-12
## 39952      0      0   urban 1      3      2     0       1-16       6-12
## 39953      0      0   urban 1      4      1     0       1-16       6-12
## 39954      1      1   urban 1      2      1     2       1-16       6-12
##           filter_. ELEC.HRS.3.NEW
## 39948 Not Selected           1-16
## 39949 Not Selected           1-16
## 39951 Not Selected           1-16
## 39952 Not Selected           1-16
## 39953 Not Selected           1-16
## 39954 Not Selected           1-16
# Cross-tabulation of STATEID by ELEC.HRS.3, followed by a bar plot of STATEID faceted by ELEC.HRS.3
table(allvariables.1nonas$STATEID,allvariables.1nonas$ELEC.HRS.3)
##                        
##                         No Access 1-16 17-24
##   Jammu & Kashmir 01            0  529   154
##   Himachal Pradesh 02           0   63  1405
##   Punjab 03                     0  161  1523
##   Chandigarh 04                 0    0     0
##   Uttarakhand 05                0  234   209
##   Haryana 06                    0 1546   159
##   Delhi 07                      0   88   799
##   Rajasthan 08                  0 1253  1043
##   Uttar Pradesh 09              1 2032   293
##   Bihar 10                      0  855   101
##   Sikkim 11                     0    0     0
##   Arunachal Pradesh 12          0    0     0
##   Nagaland 13                   0    0     0
##   Manipur 14                    0    0     0
##   Mizoram 15                    0    0     0
##   Tripura 16                    0    0     0
##   Meghalaya 17                  0    0     0
##   Assam 18                      0    0     0
##   West Bengal 19                0  375  1564
##   Jharkhand 20                  0  321   408
##   Orissa 21                     0  430  1068
##   Chhattisgarh 22               0  186  1001
##   Madhya Pradesh 23            20 1942   570
##   Gujarat 24                    0   43  1734
##   Daman & Diu 25                0    0     0
##   Dadra+Nagar Haveli 26         0    0     0
##   Maharashtra 27                0 1334  1761
##   Andhra Pradesh 28             0 1125   993
##   Karnataka 29                  0 1847  1699
##   Goa 30                        0   95    91
##   Lakshadweep 31                0    0     0
##   Kerala 32                     0   50  1468
##   Tamil Nadu 33                 1 1423   445
##   Pondicherry 34                0    0     0
##   Anadman/Nicobar 35            0    0     0
library(ggplot2)
# Bar plot of row counts per state, one stacked panel per ELEC.HRS.3 category,
# each panel with its own y range.
theme_set(theme_bw())
ggplot(allvariables.1nonas, aes(STATEID)) +
  geom_bar(aes(fill = ELEC.HRS.3)) +
  facet_wrap(~ELEC.HRS.3, ncol = 1, scales = "free_y") +
  scale_y_continuous(breaks = seq(0, 2200, by = 200)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # "none" replaces the deprecated guides(fill = FALSE); the fill legend would
  # only repeat the facet labels.
  guides(fill = "none")

# The scales = "free_y" argument gives each facet its own y-axis ticks

# Row counts per ELEC.HRS.3 category, one panel per state, each panel with its
# own y range.
theme_set(theme_bw())
ggplot(allvariables.1nonas, aes(ELEC.HRS.3)) +
  geom_bar(aes(fill = ELEC.HRS.3)) +
  facet_wrap(~STATEID, scales = "free_y") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # "none" instead of F: F is an ordinary variable that can be reassigned, and
  # logical values in guides() are deprecated.
  guides(fill = "none")

# Bar plot of EDU.HH counts, one stacked panel per ELEC.HRS.4 band, each panel
# with its own y range.
theme_set(theme_bw())
ggplot(allvariables.1nonas, aes(EDU.HH)) +
  geom_bar(aes(fill = ELEC.HRS.4)) +
  facet_wrap(~ELEC.HRS.4, ncol = 1, scales = "free_y") +
  theme(axis.text.x = element_text(angle = 90, face = "bold", size = 8)) +
  # "none" replaces the deprecated guides(fill = FALSE).
  guides(fill = "none")

# Bar plot of EDU.HH counts faceted by URBAN2011 and ELEC.HRS.4, laid out in
# four columns, each panel with its own y range.
theme_set(theme_bw())
ggplot(allvariables.1nonas, aes(EDU.HH)) +
  geom_bar(aes(fill = ELEC.HRS.4)) +
  facet_wrap(URBAN2011 ~ ELEC.HRS.4, ncol = 4, scales = "free_y") +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  # "none" replaces the deprecated guides(fill = FALSE).
  guides(fill = "none")

#Bar chart with standard errors

library(tidyr)
library(dplyr)
library(ggplot2)
library(knitr)
library(scales)
library(RColorBrewer)


# Custom standard error function: sd(x) / sqrt(n).
#
# Arguments:
#   x      numeric vector.
#   na.rm  if TRUE, missing values are dropped before both the standard
#          deviation and the sample size are computed. Defaults to FALSE,
#          in which case any NA in x makes the result NA (as before).
# Returns a single numeric value; NA when fewer than two values remain.
std_err <- function(x, na.rm = FALSE) {
  if (na.rm) {
    # Drop NAs up front so that length(x) counts only the observed values;
    # passing na.rm to sd() alone would leave the denominator too large.
    x <- x[!is.na(x)]
  }
  sd(x) / sqrt(length(x))
}

library(dplyr)
# Per-state summary of INCOME: the median (Income_med) and the standard error
# returned by std_err() (Income_se). One row per state present in the data.
# NOTE(review): std_err() is sd / sqrt(n), i.e. a standard error of the mean,
# yet it is paired with a median here - confirm this is intended.
df_ind_sum <- summarise(
  group_by(allvariables.1, STATEID),
  Income_med = median(INCOME),
  Income_se = std_err(INCOME)
)

kable(df_ind_sum)
STATEID Income_med Income_se
Jammu & Kashmir 01 149475 9080.923
Himachal Pradesh 02 104430 7858.078
Punjab 03 116000 7131.363
Uttarakhand 05 81970 6768.412
Haryana 06 98475 10288.974
Delhi 07 170000 10714.613
Rajasthan 08 76200 3296.097
Uttar Pradesh 09 55220 2120.620
Bihar 10 49990 2721.459
West Bengal 19 63000 5453.198
Jharkhand 20 55500 4307.792
Orissa 21 47625 2610.151
Chhattisgarh 22 40386 3682.346
Madhya Pradesh 23 47000 2955.330
Gujarat 24 76800 5924.410
Maharashtra 27 81800 2888.217
Andhra Pradesh 28 62000 2271.968
Karnataka 29 73075 3688.832
Goa 30 103500 8841.920
Kerala 32 134680 4764.551
Tamil Nadu 33 89600 3338.673
table(allvariables.1$STATEID)
## 
##    Jammu & Kashmir 01   Himachal Pradesh 02             Punjab 03 
##                   720                  1476                  1702 
##         Chandigarh 04        Uttarakhand 05            Haryana 06 
##                     0                   468                  1806 
##              Delhi 07          Rajasthan 08      Uttar Pradesh 09 
##                   899                  2707                  3824 
##              Bihar 10             Sikkim 11  Arunachal Pradesh 12 
##                  1547                     0                     0 
##           Nagaland 13            Manipur 14            Mizoram 15 
##                     0                     0                     0 
##            Tripura 16          Meghalaya 17              Assam 18 
##                     0                     0                     0 
##        West Bengal 19          Jharkhand 20             Orissa 21 
##                  2435                   853                  2058 
##       Chhattisgarh 22     Madhya Pradesh 23            Gujarat 24 
##                  1324                  3123                  1895 
##        Daman & Diu 25 Dadra+Nagar Haveli 26        Maharashtra 27 
##                     0                     0                  3309 
##     Andhra Pradesh 28          Karnataka 29                Goa 30 
##                  2203                  3865                   188 
##        Lakshadweep 31             Kerala 32         Tamil Nadu 33 
##                     0                  1570                  1982 
##        Pondicherry 34    Anadman/Nicobar 35 
##                     0                     0
# Median income per state as bars, with error bars of +/- Income_se.
ggplot(df_ind_sum, aes(x = STATEID, y = Income_med)) +
  geom_bar(stat = "identity") +
  geom_errorbar(
    aes(ymin = Income_med - Income_se, ymax = Income_med + Income_se),
    width = 0.2
  ) +
  labs(title = "Median Income Statewise", x = "States", y = "Median Income") +
  theme(axis.text = element_text(size = 8, angle = 90))

# v2: the same chart with y-axis ticks every 25000 from 0 to 200000.
ggplot(df_ind_sum, aes(x = STATEID, y = Income_med)) +
  geom_bar(stat = "identity") +
  geom_errorbar(
    aes(ymin = Income_med - Income_se, ymax = Income_med + Income_se),
    width = 0.2
  ) +
  scale_y_continuous(breaks = seq(0, 200000, by = 25000)) +
  labs(title = "Median Income Statewise", x = "States", y = "Median Income") +
  theme(axis.text = element_text(size = 8, angle = 90))

#v 3

#
# Custom ggplot2 theme: theme_bw() at base size 9 with a white background,
# thin grey major grid lines, no minor grid, no axis ticks, a hidden legend
# and text colours taken from the RColorBrewer "Greys" palette.
# Takes no arguments and returns a theme object to add to a plot with `+`.
#
ggplot_theme <- function() {
  # Nine-step grey palette from RColorBrewer; individual steps are picked below
  greys <- brewer.pal(n = 9, name = "Greys")
  bg_col <- "white" # alternative: greys[2]
  grid_col <- greys[3]
  axis_text_col <- greys[6]
  axis_title_col <- greys[7]
  title_col <- greys[9]

  # All overrides go into one theme() call layered on top of theme_bw()
  theme_bw(base_size = 9) +
    theme(
      # Panel, plot background and panel border all use the background colour
      panel.background = element_rect(fill = bg_col, color = bg_col),
      plot.background = element_rect(fill = bg_col, color = bg_col),
      panel.border = element_rect(color = bg_col),
      # Grid: thin major lines only, and no tick marks
      panel.grid.major = element_line(color = grid_col, size = 0.25),
      panel.grid.minor = element_blank(),
      axis.ticks = element_blank(),
      # Legend is styled but hidden by default
      legend.position = "none",
      legend.background = element_rect(fill = bg_col),
      legend.text = element_text(size = 7, color = axis_title_col),
      # Title, axis tick labels and axis titles
      plot.title = element_text(color = title_col, size = 12, vjust = 1.25),
      axis.text.x = element_text(size = 7, color = axis_text_col),
      axis.text.y = element_text(size = 7, color = axis_text_col),
      axis.title.x = element_text(size = 10, color = axis_title_col, vjust = 0),
      axis.title.y = element_text(size = 10, color = axis_title_col, vjust = 1.25),
      # Plot margins
      plot.margin = unit(c(0.35, 0.2, 0.3, 0.35), "cm")
    )
}

# v3: horizontal (flipped) median-income chart with red bars and the custom
# ggplot_theme().
ggplot(df_ind_sum, aes(x = STATEID, y = Income_med)) +
  geom_bar(stat = "identity", fill = "#c0392b", alpha = 0.75) +
  geom_errorbar(
    aes(ymin = Income_med - Income_se, ymax = Income_med + Income_se),
    width = 0.2
  ) +
  labs(title = "Median Income Statewise", x = "States", y = "Median Income") +
  coord_flip() +
  theme(axis.text = element_text(size = 8, angle = 90)) +
  ggplot_theme()

# Reverse the order of the states along the (flipped) x axis.
# scale_x_discrete(labels = rev(...)) must NOT be used for this: `labels` only
# renames the tick marks and leaves the bars in place, so every bar would be
# shown under another state's name. Reversing `limits` moves each bar together
# with its own label. factor() drops unused levels and keeps the level order.
ggplot(df_ind_sum, aes(x = STATEID, y = Income_med)) +
  geom_bar(stat = "identity", fill = "#c0392b", alpha = 0.75) +
  labs(
    title = "Median Income Statewise",
    x = "States",
    y = "Median Income"
  ) +
  theme(axis.text = element_text(size = 8, angle = 90)) +
  scale_x_discrete(limits = rev(levels(factor(df_ind_sum$STATEID)))) +
  geom_errorbar(
    aes(ymin = Income_med - Income_se, ymax = Income_med + Income_se),
    width = .2
  ) +
  coord_flip() +
  ggplot_theme()

###################

# The same state-wise medians drawn on polar coordinates, with one fill
# colour per state and the Income_med -/+ Income_se error bars.
ggplot(
  data = df_ind_sum,
  mapping = aes(x = STATEID, y = Income_med, fill = STATEID)
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Median Income Statewise",
    x = "States",
    y = "Median Income"
  ) +
  theme(axis.text = element_text(size = 8, angle = 90)) +
  geom_errorbar(
    mapping = aes(
      ymin = Income_med - Income_se,
      ymax = Income_med + Income_se
    ),
    width = 0.2
  ) +
  coord_polar()

#Line chart using mean of Income

# Mean income and total assets for every state x electricity-hours band.
# na.rm = TRUE on the assets total: ASSETS contains missing values, and without
# it a single NA turns the whole group's total into NA, which later causes that
# state/band row to be discarded by the NA-row removal step.
# NOTE(review): the rendered table and row counts recorded below were produced
# without na.rm and need to be regenerated when the document is re-knitted.
df_ind_mean <- allvariables.1 %>%
  group_by(STATEID, ELEC.HRS.3) %>%
  summarise(
    Income_mean = mean(INCOME),
    TotalAssets = sum(ASSETS, na.rm = TRUE)
  )

kable(df_ind_mean)
STATEID ELEC.HRS.3 Income_mean TotalAssets
Jammu & Kashmir 01 1-16 213974.15 9986
Jammu & Kashmir 01 17-24 260984.08 3470
Jammu & Kashmir 01 NA 345185.00 31
Himachal Pradesh 02 1-16 129297.44 1061
Himachal Pradesh 02 17-24 177701.44 25650
Himachal Pradesh 02 NA 76081.25 80
Punjab 03 1-16 160272.65 3173
Punjab 03 17-24 202844.17 32231
Punjab 03 NA 70748.76 147
Uttarakhand 05 1-16 112795.15 3830
Uttarakhand 05 17-24 164542.92 3913
Uttarakhand 05 NA 38540.83 176
Haryana 06 1-16 169019.06 NA
Haryana 06 17-24 305111.98 NA
Haryana 06 NA 122365.95 859
Delhi 07 1-16 172350.00 1920
Delhi 07 17-24 265812.55 18113
Delhi 07 NA 230660.00 58
Rajasthan 08 1-16 112094.43 17438
Rajasthan 08 17-24 164266.39 NA
Rajasthan 08 NA 70739.98 2951
Uttar Pradesh 09 No Access 132460.00 12
Uttar Pradesh 09 1-16 114182.21 31337
Uttar Pradesh 09 17-24 147426.82 5537
Uttar Pradesh 09 NA 50347.38 10809
Bihar 10 1-16 97682.27 10970
Bihar 10 17-24 139167.57 1853
Bihar 10 NA 44209.16 NA
West Bengal 19 1-16 130822.35 5341
West Bengal 19 17-24 131854.48 24971
West Bengal 19 NA 54718.13 3343
Jharkhand 20 1-16 91288.91 4605
Jharkhand 20 17-24 122659.47 6487
Jharkhand 20 NA 41349.73 681
Orissa 21 1-16 69659.57 5409
Orissa 21 17-24 110416.24 16198
Orissa 21 NA 42482.44 2840
Chhattisgarh 22 1-16 62157.20 1935
Chhattisgarh 22 17-24 96740.57 14100
Chhattisgarh 22 NA 28314.21 649
Madhya Pradesh 23 No Access 22116.30 104
Madhya Pradesh 23 1-16 80403.43 NA
Madhya Pradesh 23 17-24 132155.99 10287
Madhya Pradesh 23 NA 38475.82 3287
Gujarat 24 1-16 50916.48 509
Gujarat 24 17-24 143961.75 NA
Gujarat 24 NA 67489.78 577
Maharashtra 27 1-16 125510.31 20917
Maharashtra 27 17-24 147213.11 32273
Maharashtra 27 NA 59661.96 1390
Andhra Pradesh 28 1-16 77522.75 NA
Andhra Pradesh 28 17-24 98467.15 NA
Andhra Pradesh 28 NA 41195.14 NA
Karnataka 29 1-16 105682.90 NA
Karnataka 29 17-24 148221.97 NA
Karnataka 29 NA 71562.49 NA
Goa 30 1-16 133228.14 2106
Goa 30 17-24 139525.05 1923
Goa 30 NA 50000.00 23
Kerala 32 1-16 141827.86 984
Kerala 32 17-24 174356.54 NA
Kerala 32 NA 212943.95 607
Tamil Nadu 33 No Access 59500.00 16
Tamil Nadu 33 1-16 124014.88 26321
Tamil Nadu 33 17-24 132274.07 8761
Tamil Nadu 33 NA 71132.70 489
str(df_ind_mean)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  66 obs. of  4 variables:
##  $ STATEID    : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 2 2 2 3 3 3 5 ...
##  $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 3 NA 2 3 NA 2 3 NA 2 ...
##  $ Income_mean: num  213974 260984 345185 129297 177701 ...
##  $ TotalAssets: num  9986 3470 31 1061 25650 ...
##  - attr(*, "vars")=List of 1
##   ..$ : symbol STATEID
##  - attr(*, "drop")= logi TRUE
#lets see the nos of NAs
apply(df_ind_mean, 2, function(x)sum(is.na(x)))
##     STATEID  ELEC.HRS.3 Income_mean TotalAssets 
##           0          21           0          13
#lets remove NAs from the rows that have NAs

# Indices of the rows that contain at least one NA in any column.
df <- which(rowSums(is.na(df_ind_mean)) > 0)
df # row nos that have NAs
##  [1]  3  6  9 12 13 14 15 18 20 21 25 28 31 34 37 40 42 44 46 47 50 51 52
## [24] 53 54 55 56 59 61 62 66
length(df) #no of rows to be deleted
## [1] 31
# Guard the negative index: x[-integer(0), ] selects *no* rows, so an
# NA-free table would otherwise be emptied instead of being left unchanged.
if (length(df) > 0) {
  df_ind_mean <- df_ind_mean[-df, ] #removing NAs
}
str(df_ind_mean)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  35 obs. of  4 variables:
##  $ STATEID    : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 2 2 3 3 5 5 7 7 ...
##  $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 3 2 3 2 3 2 3 2 3 ...
##  $ Income_mean: num  213974 260984 129297 177701 160273 ...
##  $ TotalAssets: num  9986 3470 1061 25650 3173 ...
##  - attr(*, "vars")=List of 1
##   ..$ : symbol STATEID
##  - attr(*, "drop")= logi TRUE
##  - attr(*, "indices")=List of 18
##   ..$ : int  0 1
##   ..$ : int  2 3
##   ..$ : int  4 5
##   ..$ : int  6 7
##   ..$ : int  8 9
##   ..$ : int 10
##   ..$ : int  11 12 13
##   ..$ : int  14 15
##   ..$ : int  16 17
##   ..$ : int  18 19
##   ..$ : int  20 21
##   ..$ : int  22 23
##   ..$ : int  24 25
##   ..$ : int 26
##   ..$ : int  27 28
##   ..$ : int  29 30
##   ..$ : int 31
##   ..$ : int  32 33 34
##  - attr(*, "group_sizes")= int  2 2 2 2 2 1 3 2 2 2 ...
##  - attr(*, "biggest_group_size")= int 3
##  - attr(*, "labels")='data.frame':   18 obs. of  1 variable:
##   ..$ STATEID: Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 2 3 5 7 8 9 10 19 20 ...
##   ..- attr(*, "vars")=List of 1
##   .. ..$ : symbol STATEID
##   ..- attr(*, "drop")= logi TRUE
#check again for NAs
#is.na(df_ind_mean)
apply(df_ind_mean, 2, function(x)sum(is.na(x)))
##     STATEID  ELEC.HRS.3 Income_mean TotalAssets 
##           0           0           0           0
# Use the black-and-white theme for all following plots.
theme_set(theme_bw())

# Mean income per electricity-hours band, one facet per state.
df_ind_mean %>%
  ggplot(aes(x = ELEC.HRS.3, y = Income_mean)) +
  geom_bar(stat = "identity") +
  facet_wrap(~STATEID) +
  scale_y_continuous(labels = comma)

#####################

# Same values as one line per state across the electricity-hours bands.
df_ind_mean %>%
  ggplot(
    aes(x = ELEC.HRS.3, y = Income_mean, colour = STATEID, group = STATEID)
  ) +
  geom_line() +
  scale_y_continuous(labels = comma)

# from the plot above we see very few states where households falling in NO access category. Lets recheck the nos again

table(allvariables.1$ELEC.HRS.3)
## 
## No Access      1-16     17-24 
##        22     16091     18644
# Mean income against total assets, one line per electricity-hours band.
ggplot(data = df_ind_mean, mapping = aes(x = TotalAssets)) +
  geom_line(mapping = aes(y = Income_mean, colour = ELEC.HRS.3))

# The same relationship as points, faceted by electricity-hours band.
df_ind_mean %>%
  ggplot(aes(x = TotalAssets, y = Income_mean)) +
  geom_point(size = 2, colour = "steelblue", stroke = 1) +
  facet_wrap(~ELEC.HRS.3) +
  scale_y_continuous(labels = comma)

############################################
#Line chart using median Income

# Median income and total assets for every state x electricity-hours band.
# na.rm = TRUE on the assets total: ASSETS contains missing values, and without
# it a single NA turns the whole group's total into NA, which later causes that
# state/band row to be discarded by the NA-row removal step.
# NOTE(review): the rendered table and row counts recorded below were produced
# without na.rm and need to be regenerated when the document is re-knitted.
df_ind_median <- allvariables.1 %>%
  group_by(STATEID, ELEC.HRS.3) %>%
  summarise(
    Income_median = median(INCOME),
    TotalAssets = sum(ASSETS, na.rm = TRUE)
  )

kable(df_ind_median)
STATEID ELEC.HRS.3 Income_median TotalAssets
Jammu & Kashmir 01 1-16 141000.0 9986
Jammu & Kashmir 01 17-24 174601.0 3470
Jammu & Kashmir 01 NA 345185.0 31
Himachal Pradesh 02 1-16 86550.0 1061
Himachal Pradesh 02 17-24 105719.0 25650
Himachal Pradesh 02 NA 70820.0 80
Punjab 03 1-16 97160.0 3173
Punjab 03 17-24 120000.0 32231
Punjab 03 NA 64100.0 147
Uttarakhand 05 1-16 72000.0 3830
Uttarakhand 05 17-24 108830.0 3913
Uttarakhand 05 NA 32490.0 176
Haryana 06 1-16 96412.5 NA
Haryana 06 17-24 150400.0 NA
Haryana 06 NA 62800.0 859
Delhi 07 1-16 105000.0 1920
Delhi 07 17-24 180000.0 18113
Delhi 07 NA 84700.0 58
Rajasthan 08 1-16 68982.5 17438
Rajasthan 08 17-24 100000.0 NA
Rajasthan 08 NA 50695.0 2951
Uttar Pradesh 09 No Access 132460.0 12
Uttar Pradesh 09 1-16 70200.0 31337
Uttar Pradesh 09 17-24 87850.0 5537
Uttar Pradesh 09 NA 38250.0 10809
Bihar 10 1-16 61500.0 10970
Bihar 10 17-24 87775.0 1853
Bihar 10 NA 36420.0 NA
West Bengal 19 1-16 58740.0 5341
West Bengal 19 17-24 78925.0 24971
West Bengal 19 NA 39850.0 3343
Jharkhand 20 1-16 51500.0 4605
Jharkhand 20 17-24 71330.0 6487
Jharkhand 20 NA 33892.5 681
Orissa 21 1-16 47600.0 5409
Orissa 21 17-24 63000.0 16198
Orissa 21 NA 32242.5 2840
Chhattisgarh 22 1-16 30002.5 1935
Chhattisgarh 22 17-24 47730.0 14100
Chhattisgarh 22 NA 24250.0 649
Madhya Pradesh 23 No Access 18965.0 104
Madhya Pradesh 23 1-16 45707.5 NA
Madhya Pradesh 23 17-24 88800.0 10287
Madhya Pradesh 23 NA 29395.0 3287
Gujarat 24 1-16 37090.0 509
Gujarat 24 17-24 80500.0 NA
Gujarat 24 NA 46600.0 577
Maharashtra 27 1-16 77007.5 20917
Maharashtra 27 17-24 96000.0 32273
Maharashtra 27 NA 53195.0 1390
Andhra Pradesh 28 1-16 58150.0 NA
Andhra Pradesh 28 17-24 72330.0 NA
Andhra Pradesh 28 NA 34625.0 NA
Karnataka 29 1-16 69520.0 NA
Karnataka 29 17-24 85190.0 NA
Karnataka 29 NA 53400.0 NA
Goa 30 1-16 109500.0 2106
Goa 30 17-24 99000.0 1923
Goa 30 NA 50000.0 23
Kerala 32 1-16 118015.0 984
Kerala 32 17-24 135600.0 NA
Kerala 32 NA 112000.0 607
Tamil Nadu 33 No Access 59500.0 16
Tamil Nadu 33 1-16 90000.0 26321
Tamil Nadu 33 17-24 91000.0 8761
Tamil Nadu 33 NA 52435.0 489
str(df_ind_median)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  66 obs. of  4 variables:
##  $ STATEID      : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 2 2 2 3 3 3 5 ...
##  $ ELEC.HRS.3   : Factor w/ 3 levels "No Access","1-16",..: 2 3 NA 2 3 NA 2 3 NA 2 ...
##  $ Income_median: num  141000 174601 345185 86550 105719 ...
##  $ TotalAssets  : num  9986 3470 31 1061 25650 ...
##  - attr(*, "vars")=List of 1
##   ..$ : symbol STATEID
##  - attr(*, "drop")= logi TRUE
#lets see the nos of NAs
apply(df_ind_median, 2, function(x)sum(is.na(x)))
##       STATEID    ELEC.HRS.3 Income_median   TotalAssets 
##             0            21             0            13
#lets remove NAs from the rows that have NAs

# Indices of the rows that contain at least one NA in any column.
df <- which(rowSums(is.na(df_ind_median)) > 0)
df # row nos that have NAs
##  [1]  3  6  9 12 13 14 15 18 20 21 25 28 31 34 37 40 42 44 46 47 50 51 52
## [24] 53 54 55 56 59 61 62 66
length(df) #no of rows to be deleted
## [1] 31
# Guard the negative index: x[-integer(0), ] selects *no* rows, so an
# NA-free table would otherwise be emptied instead of being left unchanged.
if (length(df) > 0) {
  df_ind_median <- df_ind_median[-df, ] #removing NAs
}
str(df_ind_median)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  35 obs. of  4 variables:
##  $ STATEID      : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 2 2 3 3 5 5 7 7 ...
##  $ ELEC.HRS.3   : Factor w/ 3 levels "No Access","1-16",..: 2 3 2 3 2 3 2 3 2 3 ...
##  $ Income_median: num  141000 174601 86550 105719 97160 ...
##  $ TotalAssets  : num  9986 3470 1061 25650 3173 ...
##  - attr(*, "vars")=List of 1
##   ..$ : symbol STATEID
##  - attr(*, "drop")= logi TRUE
##  - attr(*, "indices")=List of 18
##   ..$ : int  0 1
##   ..$ : int  2 3
##   ..$ : int  4 5
##   ..$ : int  6 7
##   ..$ : int  8 9
##   ..$ : int 10
##   ..$ : int  11 12 13
##   ..$ : int  14 15
##   ..$ : int  16 17
##   ..$ : int  18 19
##   ..$ : int  20 21
##   ..$ : int  22 23
##   ..$ : int  24 25
##   ..$ : int 26
##   ..$ : int  27 28
##   ..$ : int  29 30
##   ..$ : int 31
##   ..$ : int  32 33 34
##  - attr(*, "group_sizes")= int  2 2 2 2 2 1 3 2 2 2 ...
##  - attr(*, "biggest_group_size")= int 3
##  - attr(*, "labels")='data.frame':   18 obs. of  1 variable:
##   ..$ STATEID: Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 2 3 5 7 8 9 10 19 20 ...
##   ..- attr(*, "vars")=List of 1
##   .. ..$ : symbol STATEID
##   ..- attr(*, "drop")= logi TRUE
#check again for NAs
#is.na(df_ind_mean)
apply(df_ind_median, 2, function(x)sum(is.na(x)))
##       STATEID    ELEC.HRS.3 Income_median   TotalAssets 
##             0             0             0             0
# Use the black-and-white theme for all following plots.
theme_set(theme_bw())

# Median income per electricity-hours band, one facet per state.
df_ind_median %>%
  ggplot(aes(x = ELEC.HRS.3, y = Income_median)) +
  geom_bar(stat = "identity") +
  facet_wrap(~STATEID) +
  scale_y_continuous(labels = comma)

# Same values as one line per state across the electricity-hours bands.
df_ind_median %>%
  ggplot(
    aes(x = ELEC.HRS.3, y = Income_median, colour = STATEID, group = STATEID)
  ) +
  geom_line() +
  scale_y_continuous(labels = comma)

# Median income against total assets, one line per electricity-hours band.
ggplot(data = df_ind_median, mapping = aes(x = TotalAssets)) +
  geom_line(mapping = aes(y = Income_median, colour = ELEC.HRS.3))

# The same relationship as points, faceted by electricity-hours band.
df_ind_median %>%
  ggplot(aes(x = TotalAssets, y = Income_median)) +
  geom_point(size = 2, colour = "steelblue", stroke = 1) +
  facet_wrap(~ELEC.HRS.3) +
  scale_y_continuous(labels = comma)

# Boxplots

# Keep only the columns used by the boxplots that follow.
allvariables.1new <- select(
  allvariables.1,
  c(STATEID, INCOME, NFE, ASSETS, ELEC.HRS.3, ELEC.HRS.4)
)

# Preview the first rows of the subset.
head(allvariables.1new)
##              STATEID  INCOME  NFE ASSETS ELEC.HRS.3 ELEC.HRS.4
## 1 Jammu & Kashmir 01  176100 No 0     20       1-16      12-18
## 2 Jammu & Kashmir 01 1039150 No 0     24       1-16       6-12
## 3 Jammu & Kashmir 01  182340 No 0     22       1-16       6-12
## 4 Jammu & Kashmir 01   90760 No 0     16       1-16       6-12
## 5 Jammu & Kashmir 01  212600 No 0     17       1-16       6-12
## 6 Jammu & Kashmir 01  152100 No 0     12       1-16      12-18
#lets see the nos of NAs
apply(allvariables.1new, 2, function(x)sum(is.na(x)))
##    STATEID     INCOME        NFE     ASSETS ELEC.HRS.3 ELEC.HRS.4 
##          0          0          0         18       5197       5197
# Indices of the rows that contain at least one NA in any column.
df1 <- which(rowSums(is.na(allvariables.1new)) > 0)
#df1 # row nos that have NAs
length(df1) #no of rows to be deleted
## [1] 5209
# Guard the negative index: x[-integer(0), ] selects *no* rows, so an
# NA-free table would otherwise come back empty instead of unchanged.
df_new <- if (length(df1) > 0) {
  allvariables.1new[-df1, ] #removing NAs
} else {
  allvariables.1new
}
str(df_new)
## 'data.frame':    34745 obs. of  6 variables:
##  $ STATEID   : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ INCOME    : num  176100 1039150 182340 90760 212600 ...
##  $ NFE       : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ASSETS    : num  20 24 22 16 17 12 13 7 11 10 ...
##  $ ELEC.HRS.3: Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4: Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
#lets see the nos of NAs
apply(df_new, 2, function(x)sum(is.na(x)))
##    STATEID     INCOME        NFE     ASSETS ELEC.HRS.3 ELEC.HRS.4 
##          0          0          0          0          0          0
# log(INCOME) by NFE, split by the four-band electricity-hours factor.
ggplot(data = df_new) +
  geom_boxplot(mapping = aes(x = NFE, y = log(INCOME), fill = ELEC.HRS.4))

# log(INCOME) by NFE, split by the three-band electricity-hours factor.
ggplot(data = df_new) +
  geom_boxplot(mapping = aes(x = NFE, y = log(INCOME), fill = ELEC.HRS.3))

# ASSETS by ELEC.HRS.3: notched box plot next to an untrimmed violin plot.
p1 <- ggplot(data = df_new, mapping = aes(x = ELEC.HRS.3, y = ASSETS)) +
  geom_boxplot(
    mapping = aes(fill = ELEC.HRS.3),
    width = 0.5,
    outlier.colour = "dodgerblue",
    outlier.size = 4,
    outlier.shape = 16,
    outlier.stroke = 2,
    notch = TRUE
  ) +
  labs(title = "Box plot")
p2 <- ggplot(data = df_new, mapping = aes(x = ELEC.HRS.3, y = ASSETS)) +
  geom_violin(mapping = aes(fill = ELEC.HRS.3), width = 0.5, trim = FALSE) +
  labs(title = "Violin plot (untrimmed)")
gridExtra::grid.arrange(p1, p2, ncol = 2)

# ASSETS by NFE: notched box plot next to an untrimmed violin plot.
p3 <- ggplot(data = df_new, mapping = aes(x = NFE, y = ASSETS)) +
  geom_boxplot(
    mapping = aes(fill = NFE),
    width = 0.5,
    outlier.colour = "dodgerblue",
    outlier.size = 4,
    outlier.shape = 16,
    outlier.stroke = 2,
    notch = TRUE
  ) +
  labs(title = "Box plot")
p4 <- ggplot(data = df_new, mapping = aes(x = NFE, y = ASSETS)) +
  geom_violin(mapping = aes(fill = NFE), width = 0.5, trim = FALSE) +
  labs(title = "Violin plot (untrimmed)")
gridExtra::grid.arrange(p3, p4, ncol = 2)

library(extracat)
rmb(formula = ~ NFE+ ELEC.HRS.3+ ELEC.HRS.4, data = df_new)

# Base-graphics stacked bars: NFE counts for each distinct log(INCOME) value.
barplot(
  table(df_new$NFE, log(df_new$INCOME)),
  legend.text = TRUE,
  main = "Log(Income) by NFE",
  xlab = "Log(Income)"
)

# NFE counts for each distinct ASSETS value.
barplot(
  table(df_new$NFE, df_new$ASSETS),
  legend.text = TRUE,
  main = "Assets by NFE",
  xlab = "Assets"
)

# Stacked bar chart of ASSETS counts, filled by NFE.
# The bars map NFE to the `fill` aesthetic, so the manual palette has to be a
# fill scale; a colour scale is never consulted here and the two custom
# colours would silently be ignored.
ggplot(df_new) +
  geom_bar(aes(x = ASSETS, fill = NFE)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.line = element_line(color = "black")
  ) +
  ggtitle("Assets by NFE") +
  scale_fill_manual(name = "NFE", values = c("#11c2d7", "#9f0303"))

# ELEC.HRS.3 counts for each distinct ASSETS value.
barplot(
  table(df_new$ELEC.HRS.3, df_new$ASSETS),
  legend.text = TRUE,
  main = "Assets by ELEC.HRS.3",
  xlab = "Assets"
)

# ELEC.HRS.4 counts for each distinct ASSETS value.
barplot(
  table(df_new$ELEC.HRS.4, df_new$ASSETS),
  legend.text = TRUE,
  main = "Assets by ELEC.HRS.4",
  xlab = "Assets"
)

# ggplot2 version of the ELEC.HRS.4 chart, without a panel border.
ggplot(data = df_new) +
  geom_bar(mapping = aes(x = ASSETS, fill = ELEC.HRS.4)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.line = element_line(color = "black")
  ) +
  ggtitle("Assets by ELEC.HRS.4")

# NFE counts within each ELEC.HRS.4 band.
barplot(
  table(df_new$NFE, df_new$ELEC.HRS.4),
  legend.text = TRUE,
  main = "ELEC.HRS.4 by NFE",
  xlab = "ELEC.HRS.4 "
)

# Default plot of the ELEC.HRS.4 factor.
plot(df_new$ELEC.HRS.4)

# Mosaic plot of the NFE x ELEC.HRS.4 cross-tabulation.
mosaicplot(
  table(df_new$NFE, df_new$ELEC.HRS.4),
  xlab = "NFE",
  ylab = "ELEC.HRS.4"
)

# log(INCOME) against ASSETS: one box per ASSETS value, the raw points and a
# linear fit, the latter two coloured by ELEC.HRS.3.
ggplot(data = df_new, mapping = aes(x = ASSETS, y = log(INCOME))) +
  geom_boxplot(mapping = aes(group = ASSETS)) +
  geom_point(mapping = aes(colour = ELEC.HRS.3)) +
  geom_smooth(method = "lm", mapping = aes(colour = ELEC.HRS.3)) +
  labs(title = "Assets vs. log(INCOME) grouped by ELEC.HRS.3 ") +
  theme(plot.title = element_text(size = 20))

# ROOMS against ASSETS, one line per ELEC.HRS.3 band.
allvariables.1nonas %>%
  ggplot(aes(x = ASSETS, y = ROOMS, colour = ELEC.HRS.3)) +
  geom_line() +
  ggtitle("Assets vs. Rooms grouped by ELEC.HRS.3")

# ROOMS against RICE.P, one line per ELEC.HRS.3 band.
allvariables.1nonas %>%
  ggplot(aes(x = RICE.P, y = ROOMS, colour = ELEC.HRS.3)) +
  geom_line() +
  ggtitle("RICE.P vs. ROOMS grouped by ELEC.HRS.3")

# ROOMS against RICE.P, one line per NFE level.
allvariables.1nonas %>%
  ggplot(aes(x = RICE.P, y = ROOMS, colour = NFE)) +
  geom_line() +
  ggtitle("RICE.P vs. ROOMS grouped by NFE")

#############################

library(beanplot)
library(RColorBrewer)

# One colour vector per bean: a Set3 colour followed by three "black" entries.
bean.cols <- lapply(
  brewer.pal(n = 6, name = "Set3"),
  function(fill) c(fill, rep("black", 3))
)

# Bean plot of ASSETS for each ELEC.HRS.4 band.
beanplot(
  ASSETS ~ ELEC.HRS.4,
  data = df_new,
  main = "Relationship between ELEC.HRS.4 and ASSETS",
  xlab = "ELEC.HRS.4",
  ylab = "ASSETS",
  col = bean.cols,
  lwd = 1,
  what = rep(1, 4)
)

# In one panel

# Keep only the columns used by the one-panel boxplot grid.
allvariables.1box <- select(
  allvariables.1,
  c(STATEID, INCOME, ELEC.ACCESS, NFE, URBAN2011, ELEC.HRS.3, ELEC.HRS.4)
)

# Preview the first rows of the subset.
head(allvariables.1box)
##              STATEID  INCOME ELEC.ACCESS  NFE URBAN2011 ELEC.HRS.3
## 1 Jammu & Kashmir 01  176100       Yes 1 No 0   rural 0       1-16
## 2 Jammu & Kashmir 01 1039150       Yes 1 No 0   rural 0       1-16
## 3 Jammu & Kashmir 01  182340       Yes 1 No 0   rural 0       1-16
## 4 Jammu & Kashmir 01   90760       Yes 1 No 0   rural 0       1-16
## 5 Jammu & Kashmir 01  212600       Yes 1 No 0   rural 0       1-16
## 6 Jammu & Kashmir 01  152100       Yes 1 No 0   rural 0       1-16
##   ELEC.HRS.4
## 1      12-18
## 2       6-12
## 3       6-12
## 4       6-12
## 5       6-12
## 6      12-18
#lets see the nos of NAs
apply(allvariables.1box, 2, function(x)sum(is.na(x)))
##     STATEID      INCOME ELEC.ACCESS         NFE   URBAN2011  ELEC.HRS.3 
##           0           0         144           0           0        5197 
##  ELEC.HRS.4 
##        5197
# Rows with an NA in any of the first four columns. The columns are named
# explicitly (instead of by position 1:4) so the check survives a reordering
# of the select() that builds allvariables.1box.
df2 <- which(
  rowSums(
    is.na(allvariables.1box[, c("STATEID", "INCOME", "ELEC.ACCESS", "NFE")])
  ) > 0
)
#df2 # row nos that have NAs
length(df2) #no of rows to be deleted
## [1] 144
# Guard the negative index: x[-integer(0), ] selects *no* rows, so an
# NA-free table would otherwise come back empty instead of unchanged.
df_g1 <- if (length(df2) > 0) {
  allvariables.1box[-df2, ] #removing NAs
} else {
  allvariables.1box
}
str(df_g1)
## 'data.frame':    39810 obs. of  7 variables:
##  $ STATEID    : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ INCOME     : num  176100 1039150 182340 90760 212600 ...
##  $ ELEC.ACCESS: Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ NFE        : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ URBAN2011  : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4 : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
#lets see the nos of NAs
apply(df_g1, 2, function(x)sum(is.na(x)))
##     STATEID      INCOME ELEC.ACCESS         NFE   URBAN2011  ELEC.HRS.3 
##           0           0           0           0           0        5055 
##  ELEC.HRS.4 
##        5055
# Jittered points with a transparent box plot on top: log(INCOME) by
# electricity access (g1), by NFE (g2) and by urban/rural (g3).
g1 <- df_g1 %>%
  ggplot(aes(x = ELEC.ACCESS, y = log(INCOME))) +
  geom_jitter(alpha = 0.3, colour = "tomato") +
  geom_boxplot(alpha = 0)

g2 <- df_g1 %>%
  ggplot(aes(x = NFE, y = log(INCOME))) +
  geom_jitter(alpha = 0.3, colour = "tomato") +
  geom_boxplot(alpha = 0)

g3 <- df_g1 %>%
  ggplot(aes(x = URBAN2011, y = log(INCOME))) +
  geom_jitter(alpha = 0.3, colour = "tomato") +
  geom_boxplot(alpha = 0)

#lets subset relevant columns
allvariables.1box2 <- allvariables.1 %>%
  select(c(STATEID, INCOME, ELEC.ACCESS, NFE, URBAN2011, ELEC.HRS.3, ELEC.HRS.4))

# Preview the object that was just created (the earlier allvariables.1box was
# being inspected here instead, which left allvariables.1box2 unchecked).
head(allvariables.1box2)
##              STATEID  INCOME ELEC.ACCESS  NFE URBAN2011 ELEC.HRS.3
## 1 Jammu & Kashmir 01  176100       Yes 1 No 0   rural 0       1-16
## 2 Jammu & Kashmir 01 1039150       Yes 1 No 0   rural 0       1-16
## 3 Jammu & Kashmir 01  182340       Yes 1 No 0   rural 0       1-16
## 4 Jammu & Kashmir 01   90760       Yes 1 No 0   rural 0       1-16
## 5 Jammu & Kashmir 01  212600       Yes 1 No 0   rural 0       1-16
## 6 Jammu & Kashmir 01  152100       Yes 1 No 0   rural 0       1-16
##   ELEC.HRS.4
## 1      12-18
## 2       6-12
## 3       6-12
## 4       6-12
## 5       6-12
## 6      12-18
#lets see the nos of NAs
# Work on allvariables.1box2, the subset created for this second group of
# plots (it has the same columns as allvariables.1box).
colSums(is.na(allvariables.1box2))
##     STATEID      INCOME ELEC.ACCESS         NFE   URBAN2011  ELEC.HRS.3 
##           0           0         144           0           0        5197 
##  ELEC.HRS.4 
##        5197
# Rows with an NA in any column except ELEC.ACCESS, which is deliberately left
# out of the check. Columns are named instead of addressed by position.
df3 <- which(
  rowSums(
    is.na(
      allvariables.1box2[
        ,
        c("STATEID", "INCOME", "NFE", "URBAN2011", "ELEC.HRS.3", "ELEC.HRS.4")
      ]
    )
  ) > 0
)
#df3 # row nos that have NAs
length(df3) #no of rows to be deleted
## [1] 5197
# Guard the negative index: x[-integer(0), ] selects *no* rows, so an
# NA-free table would otherwise come back empty instead of unchanged.
df_g4 <- if (length(df3) > 0) {
  allvariables.1box2[-df3, ] #removing NAs
} else {
  allvariables.1box2
}
str(df_g4)
## 'data.frame':    34757 obs. of  7 variables:
##  $ STATEID    : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ INCOME     : num  176100 1039150 182340 90760 212600 ...
##  $ ELEC.ACCESS: Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ NFE        : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ URBAN2011  : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4 : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
#lets see the nos of NAs
apply(df_g4, 2, function(x)sum(is.na(x)))
##     STATEID      INCOME ELEC.ACCESS         NFE   URBAN2011  ELEC.HRS.3 
##           0           0           2           0           0           0 
##  ELEC.HRS.4 
##           0
# Jittered points with a transparent box plot on top: log(INCOME) by
# ELEC.HRS.3 (g4), by ELEC.HRS.4 (g5) and by state (g6).
g4 <- df_g4 %>%
  ggplot(aes(x = ELEC.HRS.3, y = log(INCOME))) +
  geom_jitter(alpha = 0.3, colour = "tomato") +
  geom_boxplot(alpha = 0)

g5 <- df_g4 %>%
  ggplot(aes(x = ELEC.HRS.4, y = log(INCOME))) +
  geom_jitter(alpha = 0.3, colour = "tomato") +
  geom_boxplot(alpha = 0)

# Small, rotated axis text so the state names fit.
g6 <- df_g4 %>%
  ggplot(aes(x = STATEID, y = log(INCOME))) +
  geom_jitter(alpha = 0.3, colour = "tomato") +
  geom_boxplot(alpha = 0) +
  theme(axis.text = element_text(size = 6, angle = 90))

# Arrange the six panels in a two-column grid.
library(gridExtra)
grid.arrange(g1, g2, g3, g4, g5, g6, ncol = 2)

#Scatterplots

# Keep only the columns used by the scatter plots.
allvariables.1scat <- select(
  allvariables.1,
  c(
    STATEID, INCOME, ELEC.ACCESS, ELEC.HRS, NFE, ASSETS,
    URBAN2011, ELEC.HRS.3, ELEC.HRS.4
  )
)

# Preview the first rows of the subset.
head(allvariables.1scat)
##              STATEID  INCOME ELEC.ACCESS ELEC.HRS  NFE ASSETS URBAN2011
## 1 Jammu & Kashmir 01  176100       Yes 1       12 No 0     20   rural 0
## 2 Jammu & Kashmir 01 1039150       Yes 1        8 No 0     24   rural 0
## 3 Jammu & Kashmir 01  182340       Yes 1        8 No 0     22   rural 0
## 4 Jammu & Kashmir 01   90760       Yes 1        8 No 0     16   rural 0
## 5 Jammu & Kashmir 01  212600       Yes 1        8 No 0     17   rural 0
## 6 Jammu & Kashmir 01  152100       Yes 1       14 No 0     12   rural 0
##   ELEC.HRS.3 ELEC.HRS.4
## 1       1-16      12-18
## 2       1-16       6-12
## 3       1-16       6-12
## 4       1-16       6-12
## 5       1-16       6-12
## 6       1-16      12-18
#lets see the nos of NAs
apply(allvariables.1scat, 2, function(x)sum(is.na(x)))
##     STATEID      INCOME ELEC.ACCESS    ELEC.HRS         NFE      ASSETS 
##           0           0         144        5197           0          18 
##   URBAN2011  ELEC.HRS.3  ELEC.HRS.4 
##           0        5197        5197
# Rows with an NA in any of the first three columns. The columns are named
# explicitly (instead of by position 1:3) so the check survives a reordering
# of the select() that builds allvariables.1scat.
df4 <- which(
  rowSums(
    is.na(allvariables.1scat[, c("STATEID", "INCOME", "ELEC.ACCESS")])
  ) > 0
)
#df4 # row nos that have NAs
length(df4) #no of rows to be deleted
## [1] 144
# Guard the negative index: x[-integer(0), ] selects *no* rows, so an
# NA-free table would otherwise come back empty instead of unchanged.
df_g5 <- if (length(df4) > 0) {
  allvariables.1scat[-df4, ] #removing NAs
} else {
  allvariables.1scat
}
str(df_g5)
## 'data.frame':    39810 obs. of  9 variables:
##  $ STATEID    : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ INCOME     : num  176100 1039150 182340 90760 212600 ...
##  $ ELEC.ACCESS: Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS   : num  12 8 8 8 8 14 3 22 22 8 ...
##  $ NFE        : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ASSETS     : num  20 24 22 16 17 12 13 7 11 10 ...
##  $ URBAN2011  : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4 : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
#lets see the nos of NAs
apply(df_g5, 2, function(x)sum(is.na(x)))
##     STATEID      INCOME ELEC.ACCESS    ELEC.HRS         NFE      ASSETS 
##           0           0           0        5055           0          12 
##   URBAN2011  ELEC.HRS.3  ELEC.HRS.4 
##           0        5055        5055
table(df_g5$ELEC.HRS)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
##   22   70  264  467  906 1245 2125  783 2653  713 2123  152 2487  246  583 
##   15   16   17   18   19   20   21   22   23   24 
## 1272 1313  351 2535  330 3882  624 3273 2155 4181
table(df_g5$NFE,df_g5$ELEC.ACCESS,df_g5$ELEC.HRS.3)
## , ,  = No Access
## 
##        
##          No 0 Yes 1
##   No 0      0    20
##   Yes 1     0     2
## 
## , ,  = 1-16
## 
##        
##          No 0 Yes 1
##   No 0      0 12735
##   Yes 1     0  3354
## 
## , ,  = 17-24
## 
##        
##          No 0 Yes 1
##   No 0      0 14252
##   Yes 1     0  4392
# INCOME against electricity hours, one facet per ELEC.ACCESS level.
df_g5 %>%
  ggplot(aes(x = ELEC.HRS, y = INCOME)) +
  geom_point(colour = "aquamarine4") +
  facet_wrap(~ELEC.ACCESS, ncol = 2) +
  ggtitle("Elect Hours and Income grouped by ELEC.ACCESS") +
  scale_y_continuous(labels = comma)

# INCOME against NFE, one facet per ELEC.ACCESS level.
df_g5 %>%
  ggplot(aes(x = NFE, y = INCOME)) +
  geom_point(colour = "aquamarine4") +
  facet_wrap(~ELEC.ACCESS, ncol = 2) +
  ggtitle("NFE and Income grouped by ELEC.ACCESS") +
  scale_y_continuous(labels = comma)

str(allvariables.1nonas)
## 'data.frame':    34442 obs. of  26 variables:
##  $ STATEID       : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ DISTID        : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ VNID          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ROOMS         : num  12 10 3 4 10 5 5 2 7 2 ...
##  $ INCOME        : num  176100 1039150 182340 90760 212600 ...
##  $ EDU.HH        : Factor w/ 18 levels "none 0","1st class 1",..: 1 1 11 1 1 1 1 1 5 1 ...
##  $ WATER         : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 1 2 2 ...
##  $ RICE.P        : num  17 20 15 20 20 12 25 25 12 25 ...
##  $ ELEC.ACCESS   : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS      : num  12 8 8 8 8 14 3 22 22 8 ...
##  $ NFE           : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ASSETS        : num  20 24 22 16 17 12 13 7 11 10 ...
##  $ NADULTM       : num  2 5 1 1 3 4 2 1 1 1 ...
##  $ NADULTF       : num  2 3 2 1 3 3 2 1 1 1 ...
##  $ NCHILDM       : num  2 3 1 1 1 2 0 4 3 2 ...
##  $ NCHILDF       : num  2 3 1 1 1 2 0 1 0 0 ...
##  $ NTEENM        : num  1 1 1 0 2 0 0 0 2 0 ...
##  $ NTEENF        : num  0 1 1 1 0 0 4 1 1 0 ...
##  $ URBAN2011     : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ NADULT        : num  4 8 3 2 6 7 4 2 2 2 ...
##  $ NCHILD        : num  4 6 2 2 2 4 0 5 3 2 ...
##  $ NTEEN         : num  1 2 2 1 2 0 4 1 3 0 ...
##  $ ELEC.HRS.3    : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4    : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
##  $ filter_.      : Factor w/ 2 levels "Not Selected",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS.3.NEW: Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:5512] 42 148 189 198 221 234 270 272 274 278 ...
##   .. ..- attr(*, "names")= chr [1:5512] "42" "148" "189" "198" ...
# INCOME against electricity hours, faceted by ELEC.HRS.4.
allvariables.1nonas %>%
  ggplot(aes(x = ELEC.HRS, y = INCOME)) +
  geom_point(colour = "aquamarine4") +
  facet_wrap(~ELEC.HRS.4, ncol = 2) +
  ggtitle("Elect Hours and Income grouped by ELEC.HRS.4") +
  scale_y_continuous(labels = comma)

# INCOME against ASSETS, faceted by ELEC.HRS.3.
allvariables.1nonas %>%
  ggplot(aes(x = ASSETS, y = INCOME)) +
  geom_point(colour = "aquamarine4") +
  facet_wrap(~ELEC.HRS.3, ncol = 3) +
  ggtitle("Assets and Income grouped by ELEC.HRS.3") +
  scale_y_continuous(labels = comma)

# INCOME against electricity hours, faceted by NFE.
allvariables.1nonas %>%
  ggplot(aes(x = ELEC.HRS, y = INCOME)) +
  geom_point(colour = "aquamarine4") +
  facet_wrap(~NFE, ncol = 2) +
  ggtitle("Elect Hours and Income grouped by NFE") +
  scale_y_continuous(labels = comma)

# INCOME against ASSETS, faceted by ELEC.HRS.4.
allvariables.1nonas %>%
  ggplot(aes(x = ASSETS, y = INCOME)) +
  geom_point(colour = "aquamarine4") +
  facet_wrap(~ELEC.HRS.4, ncol = 4) +
  ggtitle("Assets and Income grouped by ELEC.HRS.4") +
  scale_y_continuous(labels = comma)

# INCOME against ASSETS, faceted by URBAN2011 x ELEC.HRS.4; the y labels are
# formatted without scientific notation.
allvariables.1nonas %>%
  ggplot(aes(x = ASSETS, y = INCOME)) +
  geom_point(colour = "aquamarine4") +
  facet_wrap(~URBAN2011 + ELEC.HRS.4, ncol = 4) +
  ggtitle("Assets and Income grouped by ELEC.HRS.4 & URBAN2011") +
  scale_y_continuous(
    labels = function(n) {
      format(n, scientific = FALSE)
    }
  )

# INCOME by electricity access, one facet (and colour) per state.
ggplot(data = df_g5, mapping = aes(x = ELEC.ACCESS, y = INCOME)) +
  geom_point(mapping = aes(colour = STATEID)) +
  facet_wrap(~STATEID) +
  scale_y_continuous(labels = comma)

# log(INCOME) against ASSETS per state, with a smoother on each facet.
ggplot(data = df_g5, mapping = aes(x = ASSETS, y = log(INCOME))) +
  geom_point(mapping = aes(colour = STATEID)) +
  facet_wrap(~STATEID) +
  stat_smooth()

# Histogram of log(INCOME), bars filled by ELEC.HRS.4.
ggplot(data = df_g5) +
  geom_histogram(mapping = aes(x = log(INCOME), fill = ELEC.HRS.4)) +
  theme_grey()

# Same histogram with the ELEC.HRS.4 bars placed side by side.
ggplot(data = df_g5) +
  geom_histogram(
    mapping = aes(x = log(INCOME), fill = ELEC.HRS.4),
    position = "dodge"
  )

# Density of log(INCOME) per ELEC.HRS.4 band, as outlines ...
ggplot(data = df_g4) +
  geom_density(mapping = aes(x = log(INCOME), colour = ELEC.HRS.4))

# ... and as filled areas.
ggplot(data = df_g4) +
  geom_density(mapping = aes(x = log(INCOME), fill = ELEC.HRS.4))

###################################

# Build new_df: the numeric columns of allvariables.1, complete rows only.
library(dplyr)
is_numeric_col <- vapply(allvariables.1, is.numeric, logical(1))
new_df <- na.omit(allvariables.1[is_numeric_col])
rm(is_numeric_col)
dim(new_df)
## [1] 34719    16
str(new_df)
## 'data.frame':    34719 obs. of  16 variables:
##  $ DISTID  : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ VNID    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ROOMS   : num  12 10 3 4 10 5 5 2 7 2 ...
##  $ INCOME  : num  176100 1039150 182340 90760 212600 ...
##  $ RICE.P  : num  17 20 15 20 20 12 25 25 12 25 ...
##  $ ELEC.HRS: num  12 8 8 8 8 14 3 22 22 8 ...
##  $ ASSETS  : num  20 24 22 16 17 12 13 7 11 10 ...
##  $ NADULTM : num  2 5 1 1 3 4 2 1 1 1 ...
##  $ NADULTF : num  2 3 2 1 3 3 2 1 1 1 ...
##  $ NCHILDM : num  2 3 1 1 1 2 0 4 3 2 ...
##  $ NCHILDF : num  2 3 1 1 1 2 0 1 0 0 ...
##  $ NTEENM  : num  1 1 1 0 2 0 0 0 2 0 ...
##  $ NTEENF  : num  0 1 1 1 0 0 4 1 1 0 ...
##  $ NADULT  : num  4 8 3 2 6 7 4 2 2 2 ...
##  $ NCHILD  : num  4 6 2 2 2 4 0 5 3 2 ...
##  $ NTEEN   : num  1 2 2 1 2 0 4 1 3 0 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:5235] 309 500 799 1187 1352 1572 1622 1955 1967 2161 ...
##   .. ..- attr(*, "names")= chr [1:5235] "309" "500" "799" "1187" ...
#Interactive Data Tables
library(DT)
datatable(new_df, options = list(pageLength = 5))
library(knitr)
kable(head(new_df))
DISTID VNID ROOMS INCOME RICE.P ELEC.HRS ASSETS NADULTM NADULTF NCHILDM NCHILDF NTEENM NTEENF NADULT NCHILD NTEEN
2 1 12 176100 17 12 20 2 2 2 2 1 0 4 4 1
2 1 10 1039150 20 8 24 5 3 3 3 1 1 8 6 2
2 1 3 182340 15 8 22 1 2 1 1 1 1 3 2 2
2 1 4 90760 20 8 16 1 1 1 1 0 1 2 2 1
2 1 10 212600 20 8 17 3 3 1 1 2 0 6 2 2
2 1 5 152100 12 14 12 4 3 2 2 0 0 7 4 0
colnos<-which(apply(is.na(new_df),2,sum)>0)
colnos
## named integer(0)
sapply(new_df,class)
##    DISTID      VNID     ROOMS    INCOME    RICE.P  ELEC.HRS    ASSETS 
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" 
##   NADULTM   NADULTF   NCHILDM   NCHILDF    NTEENM    NTEENF    NADULT 
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" 
##    NCHILD     NTEEN 
## "numeric" "numeric"
sapply(new_df,mean)
##       DISTID         VNID        ROOMS       INCOME       RICE.P 
## 1.459187e+01 5.799677e+00 2.792678e+00 1.361545e+05 2.260492e+01 
##     ELEC.HRS       ASSETS      NADULTM      NADULTF      NCHILDM 
## 1.532054e+01 1.668634e+01 1.463925e+00 1.528903e+00 6.913794e-01 
##      NCHILDF       NTEENM       NTEENF       NADULT       NCHILD 
## 6.300873e-01 2.871338e-01 2.926928e-01 2.992828e+00 1.321467e+00 
##        NTEEN 
## 5.798266e-01
#Dot plots

# Indices of the rows that have at least one NA in the first nine columns.
df6 <- which(rowSums(is.na(allvariables.1scat[, 1:9])) > 0)
# Number of rows that will be removed.
length(df6)
## [1] 5211
# Remove the rows flagged in df6 (rows containing NAs).
# The rows to KEEP are selected explicitly instead of using a negative index:
# allvariables.1scat[-df6, ] would return zero rows if df6 were ever empty.
df_g7 <- allvariables.1scat[
  setdiff(seq_len(nrow(allvariables.1scat)), df6),
]
str(df_g7)
## 'data.frame':    34743 obs. of  9 variables:
##  $ STATEID    : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ INCOME     : num  176100 1039150 182340 90760 212600 ...
##  $ ELEC.ACCESS: Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ELEC.HRS   : num  12 8 8 8 8 14 3 22 22 8 ...
##  $ NFE        : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ASSETS     : num  20 24 22 16 17 12 13 7 11 10 ...
##  $ URBAN2011  : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
##  $ ELEC.HRS.4 : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
#lets see the nos of NAs
# Number of NAs left in each column of df_g7, computed column by column
# so the data frame is not first coerced to a matrix.
vapply(df_g7, function(column) sum(is.na(column)), integer(1))
##     STATEID      INCOME ELEC.ACCESS    ELEC.HRS         NFE      ASSETS 
##           0           0           0           0           0           0 
##   URBAN2011  ELEC.HRS.3  ELEC.HRS.4 
##           0           0           0
# Dot plot of ELEC.HRS, one panel per level of ELEC.HRS.3.
ggplot(df_g7, aes(x = ELEC.HRS)) +
  geom_dotplot(dotsize = 0.4, colour = "darkred") +
  facet_grid(~ELEC.HRS.3) +
  ggtitle("Dot plot of ELEC.HRS grouped by ELEC.HRS.3 ")

# Same plot with one panel per level of ELEC.HRS.4. The title now names the
# factor actually used for faceting (it previously said ELEC.HRS.3).
ggplot(df_g7, aes(x = ELEC.HRS)) +
  geom_dotplot(dotsize = 0.4, colour = "darkred") +
  facet_grid(~ELEC.HRS.4) +
  ggtitle("Dot plot of ELEC.HRS grouped by ELEC.HRS.4 ")

#lets see the distribution of var Income

# Histogram of INCOME after some cleaning (very high values removed; they
# look like errors).
# NOTE(review): df_g5 is built elsewhere in the file - confirm it is the
# cleaned data set.
ggplot(df_g5, aes(INCOME)) +
  geom_histogram(color = "white", bins = 40) +
  scale_x_continuous(
    breaks = seq(-100000, 15000000, 1000000),
    labels = comma
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8))

# Same variable with the default number of bins and the grey theme.
ggplot(df_g5) +
  geom_histogram(aes(x = INCOME), fill = "darkgreen") +
  theme_grey()

#Let's see how the distribution changes across different factor levels

library(mosaic)

# Conditional histograms: each call draws one panel per level of the factor
# that follows `|` in the formula.
histogram(
  ~ INCOME | ELEC.ACCESS,
  data = df_g5, layout = c(1, 2),
  main = "Income by ELEC.ACCESS", col = "darkgreen"
)

histogram(
  ~ ASSETS | ELEC.ACCESS,
  data = df_g5, layout = c(1, 2),
  main = "Assets by ELEC.ACCESS", col = "darkgreen"
)

histogram(
  ~ ASSETS | URBAN2011,
  data = df_g5, layout = c(1, 2),
  main = "Assets by URBAN2011", col = "darkgreen"
)

histogram(
  ~ ASSETS | ELEC.HRS.3,
  data = df_g5,
  main = "Assets by ELEC.HRS.3", col = "darkgreen"
)

histogram(
  ~ ASSETS | ELEC.HRS.4,
  data = df_g5,
  main = "Assets by ELEC.HRS.4", col = "darkgreen"
)

histogram(
  ~ ASSETS | NFE,
  data = df_g5,
  main = "Assets by NFE", col = "darkgreen"
)

histogram(
  ~ ASSETS | STATEID,
  data = df_g5,
  main = "Assets by STATEID", col = "darkgreen"
)

histogram(
  ~ INCOME | STATEID,
  data = df_g5,
  main = "INCOME by STATEID", col = "darkgreen"
)

#using ggplot2

# Faceted by ELEC.HRS.4
# The fill colour duplicates the facet variable, so its legend is hidden.
# guides(fill = "none") replaces the deprecated guides(fill = FALSE) spelling.
ggplot(df_g5, aes(ASSETS)) +
  geom_histogram(color = "white", aes(fill = ELEC.HRS.4), bins = 30) +
  theme(axis.text.x = element_text(angle = 90)) +
  facet_grid(~ELEC.HRS.4) +
  guides(fill = "none")

# Faceted by ELEC.HRS.4 and URBAN2011
# scales = "free_y" gives each URBAN2011 row its own count axis.
ggplot(df_g5, aes(ASSETS)) +
  geom_histogram(color = "white", aes(fill = ELEC.HRS.4), bins = 30) +
  theme(axis.text.x = element_text(angle = 90)) +
  facet_grid(URBAN2011 ~ ELEC.HRS.4, scales = "free_y") +
  guides(fill = "none")

# Same plot using angle = 270 instead of angle = 90 for the x-axis labels,
# with x-axis breaks every 4 units from 0 to 40.
ggplot(df_g5, aes(ASSETS)) +
  geom_histogram(color = "white", aes(fill = ELEC.HRS.4), bins = 30) +
  theme(axis.text.x = element_text(angle = 270)) +
  facet_grid(URBAN2011 ~ ELEC.HRS.4, scales = "free_y") +
  scale_x_continuous(breaks = seq(0, 40, 4)) +
  guides(fill = "none")

#############################

library(Hmisc)

# Normal Q-Q plot of INCOME with a small histogram embedded at (0.1, 0.9)
# of the plot region.
tmp <- new_df$INCOME
qqnorm(tmp)
qqline(tmp)
tmp2 <- subplot(
  hist(tmp, xlab = "", ylab = "", main = ""),
  cnvrt.coords(0.1, 0.9, "plt")$usr,
  vadj = 1, hadj = 0
)

# Switch to the inset's graphics parameters so the reference line is drawn
# inside the histogram, then restore the saved parameters. The original saved
# `op` but never restored it, leaving later plots in the inset's coordinates.
op <- par(no.readonly = TRUE)
par(tmp2)
abline(v = 0, col = "green")
par(op)

#various ways to summarise

# Base R summary (min, quartiles, median, mean, max) for every column.
summary(object = new_df)
##      DISTID           VNID          ROOMS            INCOME        
##  Min.   : 1.00   Min.   : 1.0   Min.   : 1.000   Min.   :-1037040  
##  1st Qu.: 6.00   1st Qu.: 2.0   1st Qu.: 2.000   1st Qu.:   43500  
##  Median :12.00   Median : 4.0   Median : 2.000   Median :   80590  
##  Mean   :14.59   Mean   : 5.8   Mean   : 2.793   Mean   :  136154  
##  3rd Qu.:20.00   3rd Qu.: 7.0   3rd Qu.: 4.000   3rd Qu.:  154222  
##  Max.   :68.00   Max.   :39.0   Max.   :50.000   Max.   :11360000  
##      RICE.P         ELEC.HRS         ASSETS         NADULTM     
##  Min.   :  0.0   Min.   : 0.00   Min.   : 1.00   Min.   :0.000  
##  1st Qu.: 18.0   1st Qu.: 9.00   1st Qu.:12.00   1st Qu.:1.000  
##  Median : 22.0   Median :16.00   Median :17.00   Median :1.000  
##  Mean   : 22.6   Mean   :15.32   Mean   :16.69   Mean   :1.464  
##  3rd Qu.: 27.0   3rd Qu.:22.00   3rd Qu.:21.00   3rd Qu.:2.000  
##  Max.   :120.0   Max.   :24.00   Max.   :33.00   Max.   :9.000  
##     NADULTF         NCHILDM          NCHILDF            NTEENM      
##  Min.   :0.000   Min.   :0.0000   Min.   : 0.0000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.0000  
##  Median :1.000   Median :0.0000   Median : 0.0000   Median :0.0000  
##  Mean   :1.529   Mean   :0.6914   Mean   : 0.6301   Mean   :0.2871  
##  3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.: 1.0000   3rd Qu.:0.0000  
##  Max.   :9.000   Max.   :8.0000   Max.   :10.0000   Max.   :5.0000  
##      NTEENF           NADULT           NCHILD           NTEEN       
##  Min.   :0.0000   Min.   : 0.000   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 2.000   1st Qu.: 0.000   1st Qu.:0.0000  
##  Median :0.0000   Median : 3.000   Median : 1.000   Median :0.0000  
##  Mean   :0.2927   Mean   : 2.993   Mean   : 1.321   Mean   :0.5798  
##  3rd Qu.:0.0000   3rd Qu.: 4.000   3rd Qu.: 2.000   3rd Qu.:1.0000  
##  Max.   :5.0000   Max.   :18.000   Max.   :18.000   Max.   :7.0000
library(Hmisc)
# Hmisc and psych both export describe(), and both are attached in this file.
# Qualify the call so it does not depend on which package was attached last.
Hmisc::describe(new_df)
## new_df 
## 
##  16  Variables      34719  Observations
## ---------------------------------------------------------------------------
## DISTID 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      60       1   14.59       2       3       6      12 
##     .75     .90     .95 
##      20      29      34 
## 
## lowest :  1  2  3  4  5, highest: 63 65 66 67 68 
## ---------------------------------------------------------------------------
## VNID 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      39    0.99     5.8       1       1       2       4 
##     .75     .90     .95 
##       7      11      15 
## 
## lowest :  1  2  3  4  5, highest: 35 36 37 38 39 
## ---------------------------------------------------------------------------
## ROOMS 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      25    0.94   2.793       1       1       2       2 
##     .75     .90     .95 
##       4       5       6 
## 
## lowest :  1  2  3  4  5, highest: 23 24 25 26 50 
## ---------------------------------------------------------------------------
## INCOME 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0   13698       1  136154   12000   22000   43500   80590 
##     .75     .90     .95 
##  154223  289606  423000 
## 
## lowest : -1037040  -867025  -245000  -214475  -208138
## highest:  8096550  8322000  9563500 11169820 11360000 
## ---------------------------------------------------------------------------
## RICE.P 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      96    0.99    22.6      10      15      18      22 
##     .75     .90     .95 
##      27      32      35 
## 
## lowest :   0.0   0.5   1.0   2.0   2.5
## highest:  80.0  90.0  95.0 100.0 120.0 
## ---------------------------------------------------------------------------
## ELEC.HRS 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      25    0.99   15.32       5       6       9      16 
##     .75     .90     .95 
##      22      24      24 
## 
## lowest :  0  1  2  3  4, highest: 20 21 22 23 24 
## ---------------------------------------------------------------------------
## ASSETS 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      33       1   16.69       7       8      12      17 
##     .75     .90     .95 
##      21      24      26 
## 
## lowest :  1  2  3  4  5, highest: 29 30 31 32 33 
## ---------------------------------------------------------------------------
## NADULTM 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      10    0.81   1.464       0       1       1       1 
##     .75     .90     .95 
##       2       3       3 
## 
##              0     1    2    3   4   5  6 7 8 9
## Frequency 2411 19192 8961 3216 716 160 49 8 4 2
## %            7    55   26    9   2   0  0 0 0 0
## ---------------------------------------------------------------------------
## NADULTF 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      10    0.79   1.529       1       1       1       1 
##     .75     .90     .95 
##       2       3       3 
## 
##             0     1     2    3   4   5  6 7 8 9
## Frequency 687 19611 10772 2853 651 117 20 6 1 1
## %           2    56    31    8   2   0  0 0 0 0
## ---------------------------------------------------------------------------
## NCHILDM 
##       n missing  unique    Info    Mean 
##   34719       0       9    0.83  0.6914 
## 
##               0     1    2   3   4  5  6 7 8
## Frequency 18125 10809 4549 943 217 60 13 2 1
## %            52    31   13   3   1  0  0 0 0
## ---------------------------------------------------------------------------
## NCHILDF 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      11    0.78  0.6301       0       0       0       0 
##     .75     .90     .95 
##       1       2       2 
## 
##               0    1    2    3   4  5  6 7 8 9 10
## Frequency 20068 9543 3606 1069 305 90 28 7 1 1  1
## %            58   27   10    3   1  0  0 0 0 0  0
## ---------------------------------------------------------------------------
## NTEENM 
##       n missing  unique    Info    Mean 
##   34719       0       6    0.55  0.2871 
## 
##               0    1    2   3 4 5
## Frequency 26402 6801 1390 117 8 1
## %            76   20    4   0 0 0
## ---------------------------------------------------------------------------
## NTEENF 
##       n missing  unique    Info    Mean 
##   34719       0       6    0.56  0.2927 
## 
##               0    1    2   3  4 5
## Frequency 26340 6803 1394 159 21 2
## %            76   20    4   0  0 0
## ---------------------------------------------------------------------------
## NADULT 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      17    0.91   2.993       1       2       2       3 
##     .75     .90     .95 
##       4       5       6 
## 
##            0    1     2    3    4    5    6   7   8  9 10 11 12 13 14 16
## Frequency 32 2157 14304 8014 5621 2442 1321 452 217 83 40 18  8  6  2  1
## %          0    6    41   23   16    7    4   1   1  0  0  0  0  0  0  0
##           18
## Frequency  1
## %          0
## ---------------------------------------------------------------------------
## NCHILD 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   34719       0      14    0.92   1.321       0       0       0       1 
##     .75     .90     .95 
##       2       3       4 
## 
##               0    1    2    3    4   5   6  7  8  9 10 11 15 18
## Frequency 13008 7641 7948 3679 1529 565 201 83 34 18 10  1  1  1
## %            37   22   23   11    4   2   1  0  0  0  0  0  0  0
## ---------------------------------------------------------------------------
## NTEEN 
##       n missing  unique    Info    Mean 
##   34719       0       8    0.76  0.5798 
## 
##               0    1    2   3   4  5 6 7
## Frequency 21195 8286 4070 998 145 21 2 2
## %            61   24   12   3   0  0 0 0
## ---------------------------------------------------------------------------
# Bias number printing towards fixed notation. `scipen` is an integer penalty;
# the original passed `T`, which is a reassignable alias for TRUE and is
# coerced to 1, so the effective value is unchanged here.
# NOTE(review): raise the penalty if scientific notation still appears.
options(scipen = 1L)
library(fBasics)
# Table of basic statistics for every column, rounded to two decimals.
kable(basicStats(new_df), digits = 2)
DISTID VNID ROOMS INCOME RICE.P ELEC.HRS ASSETS NADULTM NADULTF NCHILDM NCHILDF NTEENM NTEENF NADULT NCHILD NTEEN
nobs 34719.00 34719.00 34719.00 34719.00 34719.00 34719.00 34719.00 34719.00 34719.00 34719.00 34719.00 34719.00 34719.00 34719.00 34719.00 34719.00
NAs 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
Minimum 1.00 1.00 1.00 -1037040.00 0.00 0.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
Maximum 68.00 39.00 50.00 11360000.00 120.00 24.00 33.00 9.00 9.00 8.00 10.00 5.00 5.00 18.00 18.00 7.00
1. Quartile 6.00 2.00 2.00 43500.00 18.00 9.00 12.00 1.00 1.00 0.00 0.00 0.00 0.00 2.00 0.00 0.00
3. Quartile 20.00 7.00 4.00 154222.50 27.00 22.00 21.00 2.00 2.00 1.00 1.00 0.00 0.00 4.00 2.00 1.00
Mean 14.59 5.80 2.79 136154.48 22.60 15.32 16.69 1.46 1.53 0.69 0.63 0.29 0.29 2.99 1.32 0.58
Median 12.00 4.00 2.00 80590.00 22.00 16.00 17.00 1.00 1.00 0.00 0.00 0.00 0.00 3.00 1.00 0.00
Sum 506615.00 201359.00 96959.00 4727147456.95 784820.30 531914.00 579333.00 50826.00 53082.00 24004.00 21876.00 9969.00 10162.00 103908.00 45880.00 20131.00
SE Mean 0.06 0.03 0.01 1227.71 0.05 0.04 0.03 0.00 0.00 0.00 0.00 0.00 0.00 0.01 0.01 0.00
LCL Mean 14.47 5.74 2.77 133748.14 22.51 15.25 16.62 1.45 1.52 0.68 0.62 0.28 0.29 2.98 1.31 0.57
UCL Mean 14.71 5.86 2.81 138560.83 22.70 15.39 16.75 1.47 1.54 0.70 0.64 0.29 0.30 3.01 1.34 0.59
Variance 133.65 34.13 2.96 52330646462.37 80.08 46.21 36.27 0.80 0.63 0.76 0.82 0.31 0.32 2.05 1.93 0.72
Stdev 11.56 5.84 1.72 228758.93 8.95 6.80 6.02 0.89 0.79 0.87 0.91 0.56 0.57 1.43 1.39 0.85
Skewness 1.74 2.96 2.63 15.86 0.70 -0.28 -0.07 1.23 1.34 1.37 1.78 1.96 2.06 1.44 1.24 1.46
Kurtosis 4.42 11.09 25.80 549.95 5.82 -1.34 -0.69 2.80 2.79 2.31 4.35 3.74 4.51 3.56 2.73 1.84
library(psych)
# psych::describe() is requested explicitly: Hmisc (attached earlier in this
# file) exports a different describe(), so the unqualified call depends on
# attach order.
kable(psych::describe(new_df), digits = 2)
vars n mean sd median trimmed mad min max range skew kurtosis se
DISTID 1 34719 14.59 11.56 12 13.06 10.38 1 68 67 1.74 4.42 0.06
VNID 2 34719 5.80 5.84 4 4.73 2.97 1 39 38 2.96 11.09 0.03
ROOMS 3 34719 2.79 1.72 2 2.56 1.48 1 50 49 2.63 25.80 0.01
INCOME 4 34719 136154.48 228758.93 80590 99396.43 68125.47 -1037040 11360000 12397040 15.86 549.95 1227.71
RICE.P 5 34719 22.60 8.95 22 22.48 5.93 0 120 120 0.70 5.82 0.05
ELEC.HRS 6 34719 15.32 6.80 16 15.62 8.90 0 24 24 -0.28 -1.34 0.04
ASSETS 7 34719 16.69 6.02 17 16.75 7.41 1 33 32 -0.07 -0.69 0.03
NADULTM 8 34719 1.46 0.89 1 1.37 0.00 0 9 9 1.23 2.80 0.00
NADULTF 9 34719 1.53 0.79 1 1.40 0.00 0 9 9 1.34 2.79 0.00
NCHILDM 10 34719 0.69 0.87 0 0.56 0.00 0 8 8 1.37 2.31 0.00
NCHILDF 11 34719 0.63 0.91 0 0.46 0.00 0 10 10 1.78 4.35 0.00
NTEENM 12 34719 0.29 0.56 0 0.17 0.00 0 5 5 1.96 3.74 0.00
NTEENF 13 34719 0.29 0.57 0 0.18 0.00 0 5 5 2.06 4.51 0.00
NADULT 14 34719 2.99 1.43 3 2.81 1.48 0 18 18 1.44 3.56 0.01
NCHILD 15 34719 1.32 1.39 1 1.13 1.48 0 18 18 1.24 2.73 0.01
NTEEN 16 34719 0.58 0.85 0 0.43 0.00 0 7 7 1.46 1.84 0.00
#scatterplot matrix
library(car)
# Scatterplot matrix of columns 3 to 7 with histograms on the diagonal and
# no smoother.
# NOTE(review): newer car releases appear to expect
# diagonal = list(method = "histogram") - confirm against the installed version.
scatterplotMatrix(
  new_df[, 3:7],
  diagonal = "histogram",
  smooth = FALSE
)

library(gpairs)
# Generalised pairs plot of the same columns; the upper panels show
# statistics instead of scatterplots.
gpairs(
  new_df[, 3:7],
  upper.pars = list(scatter = "stats"),
  stat.pars = list(verbose = FALSE)
)

#density plot
# Filled density plots for columns 3 to 7 of new_df on a 3 x 3 grid.
# par() returns the previous values of the settings it changes, so both mfrow
# and mar are restored afterwards. The original reset only mfrow, leaving the
# tightened margins in effect for later plots.
old_par <- par(mfrow = c(3, 3), mar = c(2.5, 2, 1.5, 1.5))
colnames <- dimnames(new_df)[[2]]
for (i in 3:7) {
  d <- density(new_df[, i])
  # Draw empty axes first, then fill the density curve as a polygon.
  plot(d, type = "n", main = colnames[i])
  polygon(d, col = "red", border = "gray")
}

par(old_par)

My files for reference:

**file:///J:/rstudio%20files/BoulderBCycle.html**