# Packages ----
library(ggplot2)
library(tidyverse)
# cowplot provides plot_grid(), used below to arrange several plots in a grid.
# It is loaded with library() rather than require() so that a missing package
# stops the script here instead of failing later at the first plot_grid() call.
library(cowplot)

# Data ----
# NOTE(review): machine-specific working directory. The relative CSV paths
# passed to read_csv() in this script are resolved against it.
setwd("C:/Users/nirma/Documents/EDX courses/MicroMaster MIT/14.310x-Data Analysis for Social Scientists/Programs/")
bihar_data <- read_csv("Bihar_sample_data.csv")
print(bihar_data)
# A tibble: 39,553 x 6
personid female adult age height_cm weight_kg
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 11010101 0 1 70 164. 48.9
2 11010102 0 1 32 157. 44
3 11010103 1 1 28 150. 37.7
4 11010104 0 0 12 146. 30.7
5 11010105 1 0 11 135. 30.2
6 11010201 0 1 38 174. 67.7
7 11010202 1 1 30 140. 57.3
8 11010203 0 0 15 163. 59.3
9 11010204 0 0 10 149. 40.7
10 11010205 1 0 16 153. 43.9
# ... with 39,543 more rows
# Keep only the rows flagged as both adult and female.
bihar_adult_females <- bihar_data %>%
  filter(adult == 1 & female == 1)
# Print the result to confirm the filter worked.
print(bihar_adult_females)
# A tibble: 11,664 x 6
personid female adult age height_cm weight_kg
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 11010103 1 1 28 150. 37.7
2 11010202 1 1 30 140. 57.3
3 11010207 1 1 35 148. 38.9
4 11010302 1 1 48 145. 35.7
5 11010303 1 1 22 NA NA
6 11010306 1 1 18 NA NA
7 11010308 1 1 28 145. 42.4
8 11010402 1 1 58 156. 51.1
9 11010404 1 1 36 156. 50.7
10 11010407 1 1 55 156. 47.2
# ... with 11,654 more rows
# Histogram of height_cm for adult females in Bihar, using default binning.
ggplot(data = bihar_adult_females, mapping = aes(x = height_cm)) +
  geom_histogram()
Some people in the sample appear to be implausibly short, so we restrict the data to heights between 120 cm and 200 cm:
# Restrict to heights strictly between 120 and 200 (height_cm).
bihar_adult_females_trunc <- bihar_adult_females %>%
  filter(height_cm > 120 & height_cm < 200)

# Histogram of the truncated heights with a descriptive x-axis label.
ggplot(data = bihar_adult_females_trunc, mapping = aes(x = height_cm)) +
  geom_histogram(colour = "darkblue", fill = "blue") +
  labs(x = "Height in centimeters, Bihar Females")
# Build a histogram of the truncated Bihar heights for one bin width.
# The x-axis label records the bin width; the y-axis label is left blank.
bihar_binwidth_histogram <- function(bin_width) {
  ggplot(data = bihar_adult_females_trunc, mapping = aes(x = height_cm)) +
    geom_histogram(binwidth = bin_width, colour = "darkblue", fill = "blue") +
    labs(x = paste0("bin width=", bin_width), y = "")
}

# Each plot is stored (for the grid further down) and then displayed.
bihar1 <- bihar_binwidth_histogram(5)
bihar1

bihar2 <- bihar_binwidth_histogram(10)
# To save this plot to disk: ggsave("output/bihar_bin10.pdf", plot = bihar2)
bihar2

bihar3 <- bihar_binwidth_histogram(20)
bihar3

bihar4 <- bihar_binwidth_histogram(50)
bihar4
# Arrange the four bin-width histograms in one grid with cowplot's plot_grid().
# A single label string is supplied; hjust and vjust adjust its placement.
plot_grid(
  plotlist = list(bihar1, bihar2, bihar3, bihar4),
  labels = "Female Height in Bihar",
  hjust = -1,
  vjust = 0.2
)
Source: US data from the National Health and Nutrition Examination Survey (NHANES)
# Load the US sample and inspect its column structure.
us_data <- read_csv(file = "US_sample_data.csv")
str(object = us_data)
tibble [9,813 x 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ X1 : num [1:9813] 1 2 3 4 5 6 7 8 9 10 ...
$ seqn : num [1:9813] 73557 73558 73559 73560 73561 ...
$ female : num [1:9813] 0 0 0 0 1 0 0 1 1 0 ...
$ adult : num [1:9813] 1 1 1 0 1 1 0 1 1 1 ...
$ age : num [1:9813] 69 54 72 9 73 56 0 61 56 65 ...
$ height_cm: num [1:9813] 171 177 175 137 162 ...
$ weight_kg: num [1:9813] 78.3 89.5 88.9 32.2 52 105 7.4 93.4 61.8 65.3 ...
- attr(*, "spec")=
.. cols(
.. X1 = col_double(),
.. seqn = col_double(),
.. female = col_double(),
.. adult = col_double(),
.. age = col_double(),
.. height_cm = col_double(),
.. weight_kg = col_double()
.. )
# Keep adult females whose height_cm lies strictly between 120 and 200.
us_adult_females_trunc <- us_data %>%
  filter(female == 1 & adult == 1 & height_cm > 120 & height_cm < 200)
str(object = us_adult_females_trunc)
tibble [2,969 x 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ X1 : num [1:2969] 5 8 9 11 16 22 24 38 39 46 ...
$ seqn : num [1:2969] 73561 73564 73566 73568 73574 ...
$ female : num [1:2969] 1 1 1 1 1 1 1 1 1 1 ...
$ adult : num [1:2969] 1 1 1 1 1 1 1 1 1 1 ...
$ age : num [1:2969] 73 61 56 26 33 38 23 57 50 69 ...
$ height_cm: num [1:2969] 162 162 153 152 158 ...
$ weight_kg: num [1:2969] 52 93.4 61.8 47.1 56.8 ...
- attr(*, "spec")=
.. cols(
.. X1 = col_double(),
.. seqn = col_double(),
.. female = col_double(),
.. adult = col_double(),
.. age = col_double(),
.. height_cm = col_double(),
.. weight_kg = col_double()
.. )
# Count histogram of US adult female heights.
ggplot(us_adult_females_trunc, aes(height_cm)) +
  geom_histogram(fill = "red", color = "darkred") +
  xlab("Height in centimeters, US females")

# Density-scaled histogram with a Gaussian kernel density estimate overlaid.
# after_stat(density) replaces the deprecated ..density.. notation. The data
# and the x mapping are inherited from ggplot() instead of repeated per layer.
ggplot(us_adult_females_trunc, aes(height_cm)) +
  geom_histogram(
    aes(y = after_stat(density)),
    fill = "white",
    color = "darkred"
  ) +
  geom_density(kernel = "gaussian")
Just as we played around with the bin width in the histograms, we can play around with the bandwidth in kernel density estimation, for example:
# Build a density-scaled histogram of US adult female heights with a Gaussian
# kernel density estimate of the given bandwidth drawn on top.
# The x-axis label records the bandwidth; the y-axis label is left blank.
# after_stat(density) replaces the deprecated ..density.. notation.
us_bandwidth_plot <- function(bandwidth) {
  ggplot(us_adult_females_trunc, aes(height_cm)) +
    geom_histogram(
      aes(y = after_stat(density)),
      fill = "white",
      color = "darkred"
    ) +
    geom_density(kernel = "gaussian", bw = bandwidth) +
    xlab(paste0("bw=", bandwidth)) +
    ylab("")
}

# The four plots are only stored here; they are shown together in a grid below.
US1 <- us_bandwidth_plot(1)
US2 <- us_bandwidth_plot(5)
US3 <- us_bandwidth_plot(10)
US4 <- us_bandwidth_plot(20)
# Arrange the four bandwidth plots in one grid with cowplot's plot_grid().
# A single label string is supplied; hjust and vjust adjust its placement.
plot_grid(
  plotlist = list(US1, US2, US3, US4),
  labels = "Female Height in the US",
  hjust = -1,
  vjust = 0.2
)
# Overlay the raw-count histograms of both samples on one set of axes.
# The Bihar layer is added first and the US layer second.
ggplot() +
  geom_histogram(
    mapping = aes(x = height_cm),
    data = bihar_adult_females_trunc,
    colour = "darkblue",
    fill = "blue"
  ) +
  geom_histogram(
    mapping = aes(x = height_cm),
    data = us_adult_females_trunc,
    colour = "darkred",
    fill = "red"
  )
Looking at these histograms, one could mistakenly conclude that women in India are comparatively taller than women in the US, so this comparative histogram is deceiving. The two samples have very different sizes, so it is not a good idea to put counts on the y-axis. Density makes more sense. Let’s try:
# Overlay density-scaled histograms of both samples so that the different
# sample sizes no longer drive the bar heights.
# after_stat(density) replaces the deprecated ..density.. notation.
ggplot() +
  geom_histogram(
    data = bihar_adult_females_trunc,
    aes(x = height_cm, y = after_stat(density)),
    fill = "blue",
    color = "darkblue"
  ) +
  geom_histogram(
    data = us_adult_females_trunc,
    aes(x = height_cm, y = after_stat(density)),
    fill = "red",
    color = "darkred"
  ) +
  xlab("Height in centimeters")
These histograms make a little more sense. However, the bars overlap, so we cannot actually see what is going on on the right-hand side of the Bihar plot. So it may be wise to use something that shows the frequency points rather than the bars themselves. Let’s give it a try:
# Density-scaled frequency polygons for both samples; lines do not hide each
# other the way filled bars do.
# after_stat(density) replaces the deprecated ..density.. notation.
ggplot() +
  geom_freqpoly(
    data = bihar_adult_females_trunc,
    aes(x = height_cm, y = after_stat(density)),
    color = "darkblue"
  ) +
  geom_freqpoly(
    data = us_adult_females_trunc,
    aes(x = height_cm, y = after_stat(density)),
    color = "darkred"
  ) +
  xlab("Height in centimeters")
Amazing! Everything is visible, and we get a complete picture of the height distributions of women in Bihar and the US, and of how they compare.
# Kernel density estimates of height for both samples on shared axes.
ggplot() +
  geom_density(
    mapping = aes(x = height_cm),
    data = bihar_adult_females_trunc,
    colour = "darkblue"
  ) +
  geom_density(
    mapping = aes(x = height_cm),
    data = us_adult_females_trunc,
    colour = "darkred"
  ) +
  labs(x = "Height in centimeters")
We saw pointy tips and kinks in the previous plots, whereas the curves in this plot are smoother. It definitely looks better, but there is always a trade-off between preserving the information in the data and the smoothness of the plot.
# Empirical cumulative distribution functions of height for both samples.
ggplot() +
  stat_ecdf(
    mapping = aes(x = height_cm),
    data = bihar_adult_females_trunc,
    colour = "darkblue"
  ) +
  stat_ecdf(
    mapping = aes(x = height_cm),
    data = us_adult_females_trunc,
    colour = "darkred"
  ) +
  labs(x = "Height in centimeters")
The function ‘stat_ecdf’ computes the empirical cumulative distribution function (CDF). The CDF looks like a much better option for comparison. The CDF itself is the integral of the density.
Professor Duflo at MIT thinks that, to some extent, representing a distribution with a PDF versus a CDF is a matter of (A) choice and (B) what we want to study.
With the PDF version of the plot we can better see the fatness, symmetry, etc. In addition, the shape of one particular distribution is much more visible with a PDF than with a CDF. So, when visualizing the data, we start with the PDF. Then, when we want to compare distributions, the professor personally prefers the CDF.