This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

Data manipulation using UNDP HDI Data

Read HDI (.csv) file

library(knitr)
file_url <- "https://raw.githubusercontent.com/amarnathbose/Datafiles/master/hdi2017_2.csv"
h <- read.csv(file_url)
kable(head(h))
rank country hdi lifeexp eyrsschool ayrsschool pcgni
1 Norway 0.953 82.3 17.9 12.6 68012
2 Switzerland 0.944 83.5 16.2 13.4 57625
3 Australia 0.939 83.1 22.9 12.9 43560
4 Ireland 0.938 81.6 19.6 12.5 53754
5 Germany 0.936 81.2 17.0 14.1 46136
6 Iceland 0.935 82.9 19.3 12.4 45810

Query: India’s position in HDI ranking

# What is India's rank in the 4 development attributes
which(h$country=='India')
## [1] 130
h[h$country=='India','hdi']
## [1] 0.64
rank(-h$hdi)[which(h$country=='India')]
## [1] 130

India’s rank is 130. This can also be seen by 130 and the HDI score is 0.64.

Query: India’s rank for 4 parameters, separately

paste('Life Exp rank = ',rank(-h$lifeexp)[which(h$country=='India')])
## [1] "Life Exp rank =  134"
paste('Exp Years Schooling rank = ',rank(-h$eyrsschool)[which(h$country=='India')])
## [1] "Exp Years Schooling rank =  128"
paste('Avg Years Schooling rank = ',rank(-h$ayrsschool)[which(h$country=='India')])
## [1] "Avg Years Schooling rank =  137"
paste('Per Capita GNI rank = ',rank(-h$pcgni)[which(h$country=='India')])
## [1] "Per Capita GNI rank =  124"

Investigation into Balanced Development

using 1. Life Expectancy 2. Expected and Average Years of Schooling 3. Per Capita GNI

l <- rank(-h$lifeexp)
e <- rank(-h$eyrsschool)
a <- rank(-h$ayrsschool)
p <- rank(-h$pcgni)
h <- cbind(h,l,e,a,p)
kable(head(h))
rank country hdi lifeexp eyrsschool ayrsschool pcgni l e a p
1 Norway 0.953 82.3 17.9 12.6 68012 14 8.5 14.5 6
2 Switzerland 0.944 83.5 16.2 13.4 57625 3 29.5 2.5 10
3 Australia 0.939 83.1 22.9 12.9 43560 7 1.0 7.5 21
4 Ireland 0.938 81.6 19.6 12.5 53754 21 3.0 17.5 12
5 Germany 0.936 81.2 17.0 14.1 46136 26 16.0 1.0 18
6 Iceland 0.935 82.9 19.3 12.4 45810 8 4.0 21.0 19

Plug in differences

for (i in 1:dim(h)[1])
  h[i,'d'] <- max(h[i,8:11])-min(h[i,8:11])
kable(head(h),caption='Top 6 countries')
Top 6 countries
rank country hdi lifeexp eyrsschool ayrsschool pcgni l e a p d
1 Norway 0.953 82.3 17.9 12.6 68012 14 8.5 14.5 6 8.5
2 Switzerland 0.944 83.5 16.2 13.4 57625 3 29.5 2.5 10 27.0
3 Australia 0.939 83.1 22.9 12.9 43560 7 1.0 7.5 21 20.0
4 Ireland 0.938 81.6 19.6 12.5 53754 21 3.0 17.5 12 18.0
5 Germany 0.936 81.2 17.0 14.1 46136 26 16.0 1.0 18 25.0
6 Iceland 0.935 82.9 19.3 12.4 45810 8 4.0 21.0 19 17.0

Top unbalanced development countries

# 10 countries with extreme unbalanced HDI indicators
h <- h[order(-h$d),]
kable(head(h[,c(2,3,8:12)],10))
country hdi l e a p d
141 Equatorial Guinea 0.591 180.5 170.5 150.0 61.0 119.5
56 Kuwait 0.803 83.0 86.0 121.5 5.0 116.5
112 Moldova (Republic of) 0.700 111.5 140.0 40.0 132.0 100.0
122 Kyrgyzstan 0.672 117.0 90.0 53.5 152.5 99.0
102 Maldives 0.717 45.5 117.5 140.0 81.0 94.5
71 Georgia 0.780 99.5 54.5 10.0 104.0 94.0
105 Uzbekistan 0.710 114.0 134.0 42.0 123.0 92.0
37 Qatar 0.856 39.0 90.0 77.0 1.0 89.0
59 Kazakhstan 0.800 125.5 50.5 36.5 55.0 89.0
114 South Africa 0.699 160.5 93.0 71.5 89.0 89.0

Top balanced development countries

# 10 countries with best balanced HDI indicators
h <- h[order(h$d),]
kable(head(h[,c(2,3,8:12)],10))
country hdi l e a p d
1 Norway 0.953 14.0 8.5 14.5 6 8.5
99 Saint Vincent and the Grenadines 0.723 101.0 93.0 101.5 98 8.5
111 Paraguay 0.702 102.5 111.5 105.0 107 9.0
174 Gambia 0.460 168.0 172.0 177.0 176 9.0
8 Sweden 0.933 11.0 10.5 21.0 16 10.5
89 Peru 0.750 78.5 81.5 90.0 91 12.5
130 India 0.640 134.0 128.0 137.0 124 13.0
46 Croatia 0.831 42.5 54.5 45.5 56 13.5
180 Mozambique 0.437 176.0 168.5 177.0 182 13.5
20 Austria 0.908 18.0 32.0 29.5 20 14.0

Monte Carlo Simulation of \(\pi\)

The algorithm

Consider a unit circle inscribed in a square. The ratio of the areas of the circle and the square is \(\frac{\pi}{4}\). Simulate a large number of points in the square. (With the centre of the square defined as (0,0) this is equivalent to generating random (x,y) where \(-1\leqslant x \leqslant +1\) and \(-1\leqslant y \leqslant +1\). The points inside the circle have the property \(x^2 + y^2 \leqslant 1\), since the circle has unit radius, with centre (0,0)

Generate 10,000 points - i.e. 10,000 (x,y) tuples

x <- runif(10000,-1,1)
y <- runif(10000,-1,1)

Compute \(\pi\)

z <- x^2 + y^2
pi_approx <- length(z[z<=1])*4/length(z)

The simulated value of \(\pi\) is 3.1656

Effect of varying the number of simulated points on the convergence of \(\pi\)

f <- function(ntrials) {
  x <- runif(ntrials,-1,1)
  y <- runif(ntrials,-1,1)
  z <- x^2 + y^2
  pia <- length(z[z<=1])*4/length(z)
  return(pia)
}
ntrials <- 10^seq(3,6,0.05)
pis <- sapply(ntrials, f)

Now we plot the convergence pattern for \(\pi\)

The value of \(\pi\) for the simulation with \(10^6\) points is 3.140924

library(ggplot2)
qplot(ntrials,pis,main='Monte Carlo simulation of pi',xlab='Trials',ylab='pi',geom=c("point", "line"))

An example of regression using R

Read the GenderDiscrimination data set

file_url <- "https://raw.githubusercontent.com/amarnathbose/Datafiles/master/GenderDiscrimination.csv"
g <- read.csv(file_url)
#g <- read.csv('GenderDiscrimination.csv',header=T)
kable(g[sample(1:dim(g)[1],6,replace=F),])
Gender Experience Salary
181 Male 23 91000
84 Female 10 71200
204 Male 39 148000
186 Male 16 94000
54 Female 9 82800
109 Female 14 79400
summary(g)
##     Gender      Experience        Salary      
##  Female:140   Min.   : 2.00   Min.   : 53400  
##  Male  : 68   1st Qu.: 7.00   1st Qu.: 66000  
##               Median :10.00   Median : 74000  
##               Mean   :12.05   Mean   : 79844  
##               3rd Qu.:16.00   3rd Qu.: 88000  
##               Max.   :39.00   Max.   :194000

Plot to see how salary relates to experience

qplot(Experience, Salary, data=g, main="Salary versus Experience")

## Superimpose the line of best fit on this scatterplot

options(repr.plot.width=6, repr.plot.height=5)
plot(g$Experience,g$Salary, main="Salary versus Experience")
l1 <- lm(Salary ~ Experience, data=g)
abline(l1)

## Examine the best fit linear model

names(summary(l1))
##  [1] "call"          "terms"         "residuals"     "coefficients" 
##  [5] "aliased"       "sigma"         "df"            "r.squared"    
##  [9] "adj.r.squared" "fstatistic"    "cov.unscaled"
l1$coefficients
## (Intercept)  Experience 
##   59033.075    1727.311
summary(l1)$r.squared
## [1] 0.3149884
summary(l1)$coefficients
##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 59033.075  2499.8476 23.614670 1.592315e-60
## Experience   1727.311   177.4755  9.732669 1.168931e-18

Model Salary on Experience as well as Gender

The scatter plot - gender differentiated

qplot(Experience, Salary, data=g, geom='point', color=Gender, main="Salary versus Experience - Gender Differentiated")

### Fitting the new Gender-differentiated model

g <- cbind(g,Female=ifelse(as.numeric(g$Gender)==1,1,0))
l2 <- lm(Salary ~ Experience + Experience*Female + Female, data=g)
summary(l2)
## 
## Call:
## lm(formula = Salary ~ Experience + Experience * Female + Female, 
##     data = g)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -71048  -9278  -1701   9166  47932 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        58299.3     2998.6  19.442  < 2e-16 ***
## Experience          2753.0      199.8  13.781  < 2e-16 ***
## Female              8034.3     4110.6   1.955    0.052 .  
## Experience:Female  -2086.2      287.3  -7.261 7.95e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15110 on 204 degrees of freedom
## Multiple R-squared:  0.5561, Adjusted R-squared:  0.5495 
## F-statistic: 85.18 on 3 and 204 DF,  p-value: < 2.2e-16

The plot and the model parameters

options(repr.plot.width=6, repr.plot.height=4)
plot(g$Experience,g$Salary, col=ifelse(as.numeric(g$Gender)==1,"blue","red"), 
     #pch=ifelse(as.numeric(g$Gender)==1,1,19), 
     pch=19,
     main="Salary versus Experience - Gender Differentiated",xlab="Experience in years",ylab="Salary")
legend("topleft", c("Female", "Male"), col=c("blue", "red"),lty=1)
abline(l2$coefficients[1]+l2$coefficients[3],l2$coefficients[2]+l2$coefficients[4],col="blue")
abline(l2$coefficients[1],l2$coefficients[2],col="red")

summary(l2)
## 
## Call:
## lm(formula = Salary ~ Experience + Experience * Female + Female, 
##     data = g)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -71048  -9278  -1701   9166  47932 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        58299.3     2998.6  19.442  < 2e-16 ***
## Experience          2753.0      199.8  13.781  < 2e-16 ***
## Female              8034.3     4110.6   1.955    0.052 .  
## Experience:Female  -2086.2      287.3  -7.261 7.95e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15110 on 204 degrees of freedom
## Multiple R-squared:  0.5561, Adjusted R-squared:  0.5495 
## F-statistic: 85.18 on 3 and 204 DF,  p-value: < 2.2e-16

A question of aesthetics - ggplot

options(repr.plot.width=6, repr.plot.height=3.5)
ggplot(g, aes(x=Experience, y=Salary, color=Gender)) + geom_point() + geom_smooth(method='lm')

Finally, the model linear equations

  1. For males, Salary = 58299 + 2753 Experience
  2. For females, Salary = 66334 + 667 Experience

Examples and Exercises

A function to return multiples of 3

# function to return multiples of 3
m3 <- function(f=1, t) {
  mul <- c()
  for (i in f:t) {
    k <- i%%3
    if (k==0)
      mul <- c(mul,i)
  }
  return(mul)
}

Invoke the function

j <- m3(,15) # m3(to=15)
j
## [1]  3  6  9 12 15

Now, a function to return multiples of an arbitrary integer, passed as a function argument

# function to return multiples of an arbitrary integer, m
mm <- function(f, t, m) {
  mul <- c()
  for (i in f:t) {
    k <- i%%m
    if (k==0)
      mul <- c(mul,i)
  }
  return(mul)
}

Invoke the improved function

m3(13,29)
## [1] 15 18 21 24 27
mm(13,29,3)
## [1] 15 18 21 24 27
mm(12,200,17)
##  [1]  17  34  51  68  85 102 119 136 153 170 187