This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
library(knitr)
file_url <- "https://raw.githubusercontent.com/amarnathbose/Datafiles/master/hdi2017_2.csv"
h <- read.csv(file_url)
kable(head(h))
| rank | country | hdi | lifeexp | eyrsschool | ayrsschool | pcgni |
|---|---|---|---|---|---|---|
| 1 | Norway | 0.953 | 82.3 | 17.9 | 12.6 | 68012 |
| 2 | Switzerland | 0.944 | 83.5 | 16.2 | 13.4 | 57625 |
| 3 | Australia | 0.939 | 83.1 | 22.9 | 12.9 | 43560 |
| 4 | Ireland | 0.938 | 81.6 | 19.6 | 12.5 | 53754 |
| 5 | Germany | 0.936 | 81.2 | 17.0 | 14.1 | 46136 |
| 6 | Iceland | 0.935 | 82.9 | 19.3 | 12.4 | 45810 |
# What is India's rank in the 4 development attributes
which(h$country=='India')
## [1] 130
h[h$country=='India','hdi']
## [1] 0.64
rank(-h$hdi)[which(h$country=='India')]
## [1] 130
India’s rank is 130. This can also be seen by 130 and the HDI score is 0.64.
paste('Life Exp rank = ',rank(-h$lifeexp)[which(h$country=='India')])
## [1] "Life Exp rank = 134"
paste('Exp Years Schooling rank = ',rank(-h$eyrsschool)[which(h$country=='India')])
## [1] "Exp Years Schooling rank = 128"
paste('Avg Years Schooling rank = ',rank(-h$ayrsschool)[which(h$country=='India')])
## [1] "Avg Years Schooling rank = 137"
paste('Per Capita GNI rank = ',rank(-h$pcgni)[which(h$country=='India')])
## [1] "Per Capita GNI rank = 124"
using 1. Life Expectancy 2. Expected and Average Years of Schooling 3. Per Capita GNI
l <- rank(-h$lifeexp)
e <- rank(-h$eyrsschool)
a <- rank(-h$ayrsschool)
p <- rank(-h$pcgni)
h <- cbind(h,l,e,a,p)
kable(head(h))
| rank | country | hdi | lifeexp | eyrsschool | ayrsschool | pcgni | l | e | a | p |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Norway | 0.953 | 82.3 | 17.9 | 12.6 | 68012 | 14 | 8.5 | 14.5 | 6 |
| 2 | Switzerland | 0.944 | 83.5 | 16.2 | 13.4 | 57625 | 3 | 29.5 | 2.5 | 10 |
| 3 | Australia | 0.939 | 83.1 | 22.9 | 12.9 | 43560 | 7 | 1.0 | 7.5 | 21 |
| 4 | Ireland | 0.938 | 81.6 | 19.6 | 12.5 | 53754 | 21 | 3.0 | 17.5 | 12 |
| 5 | Germany | 0.936 | 81.2 | 17.0 | 14.1 | 46136 | 26 | 16.0 | 1.0 | 18 |
| 6 | Iceland | 0.935 | 82.9 | 19.3 | 12.4 | 45810 | 8 | 4.0 | 21.0 | 19 |
for (i in 1:dim(h)[1])
h[i,'d'] <- max(h[i,8:11])-min(h[i,8:11])
kable(head(h),caption='Top 6 countries')
| rank | country | hdi | lifeexp | eyrsschool | ayrsschool | pcgni | l | e | a | p | d |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Norway | 0.953 | 82.3 | 17.9 | 12.6 | 68012 | 14 | 8.5 | 14.5 | 6 | 8.5 |
| 2 | Switzerland | 0.944 | 83.5 | 16.2 | 13.4 | 57625 | 3 | 29.5 | 2.5 | 10 | 27.0 |
| 3 | Australia | 0.939 | 83.1 | 22.9 | 12.9 | 43560 | 7 | 1.0 | 7.5 | 21 | 20.0 |
| 4 | Ireland | 0.938 | 81.6 | 19.6 | 12.5 | 53754 | 21 | 3.0 | 17.5 | 12 | 18.0 |
| 5 | Germany | 0.936 | 81.2 | 17.0 | 14.1 | 46136 | 26 | 16.0 | 1.0 | 18 | 25.0 |
| 6 | Iceland | 0.935 | 82.9 | 19.3 | 12.4 | 45810 | 8 | 4.0 | 21.0 | 19 | 17.0 |
# 10 countries with extreme unbalanced HDI indicators
h <- h[order(-h$d),]
kable(head(h[,c(2,3,8:12)],10))
| country | hdi | l | e | a | p | d | |
|---|---|---|---|---|---|---|---|
| 141 | Equatorial Guinea | 0.591 | 180.5 | 170.5 | 150.0 | 61.0 | 119.5 |
| 56 | Kuwait | 0.803 | 83.0 | 86.0 | 121.5 | 5.0 | 116.5 |
| 112 | Moldova (Republic of) | 0.700 | 111.5 | 140.0 | 40.0 | 132.0 | 100.0 |
| 122 | Kyrgyzstan | 0.672 | 117.0 | 90.0 | 53.5 | 152.5 | 99.0 |
| 102 | Maldives | 0.717 | 45.5 | 117.5 | 140.0 | 81.0 | 94.5 |
| 71 | Georgia | 0.780 | 99.5 | 54.5 | 10.0 | 104.0 | 94.0 |
| 105 | Uzbekistan | 0.710 | 114.0 | 134.0 | 42.0 | 123.0 | 92.0 |
| 37 | Qatar | 0.856 | 39.0 | 90.0 | 77.0 | 1.0 | 89.0 |
| 59 | Kazakhstan | 0.800 | 125.5 | 50.5 | 36.5 | 55.0 | 89.0 |
| 114 | South Africa | 0.699 | 160.5 | 93.0 | 71.5 | 89.0 | 89.0 |
# 10 countries with best balanced HDI indicators
h <- h[order(h$d),]
kable(head(h[,c(2,3,8:12)],10))
| country | hdi | l | e | a | p | d | |
|---|---|---|---|---|---|---|---|
| 1 | Norway | 0.953 | 14.0 | 8.5 | 14.5 | 6 | 8.5 |
| 99 | Saint Vincent and the Grenadines | 0.723 | 101.0 | 93.0 | 101.5 | 98 | 8.5 |
| 111 | Paraguay | 0.702 | 102.5 | 111.5 | 105.0 | 107 | 9.0 |
| 174 | Gambia | 0.460 | 168.0 | 172.0 | 177.0 | 176 | 9.0 |
| 8 | Sweden | 0.933 | 11.0 | 10.5 | 21.0 | 16 | 10.5 |
| 89 | Peru | 0.750 | 78.5 | 81.5 | 90.0 | 91 | 12.5 |
| 130 | India | 0.640 | 134.0 | 128.0 | 137.0 | 124 | 13.0 |
| 46 | Croatia | 0.831 | 42.5 | 54.5 | 45.5 | 56 | 13.5 |
| 180 | Mozambique | 0.437 | 176.0 | 168.5 | 177.0 | 182 | 13.5 |
| 20 | Austria | 0.908 | 18.0 | 32.0 | 29.5 | 20 | 14.0 |
Consider a unit circle inscribed in a square. The ratio of the areas of the circle and the square is \(\frac{\pi}{4}\). Simulate a large number of points in the square. (With the centre of the square defined as (0,0) this is equivalent to generating random (x,y) where \(-1\leqslant x \leqslant +1\) and \(-1\leqslant y \leqslant +1\). The points inside the circle have the property \(x^2 + y^2 \leqslant 1\), since the circle has unit radius, with centre (0,0)
x <- runif(10000,-1,1)
y <- runif(10000,-1,1)
z <- x^2 + y^2
pi_approx <- length(z[z<=1])*4/length(z)
The simulated value of \(\pi\) is 3.1656
f <- function(ntrials) {
x <- runif(ntrials,-1,1)
y <- runif(ntrials,-1,1)
z <- x^2 + y^2
pia <- length(z[z<=1])*4/length(z)
return(pia)
}
ntrials <- 10^seq(3,6,0.05)
pis <- sapply(ntrials, f)
The value of \(\pi\) for the simulation with \(10^6\) points is 3.140924
library(ggplot2)
qplot(ntrials,pis,main='Monte Carlo simulation of pi',xlab='Trials',ylab='pi',geom=c("point", "line"))
file_url <- "https://raw.githubusercontent.com/amarnathbose/Datafiles/master/GenderDiscrimination.csv"
g <- read.csv(file_url)
#g <- read.csv('GenderDiscrimination.csv',header=T)
kable(g[sample(1:dim(g)[1],6,replace=F),])
| Gender | Experience | Salary | |
|---|---|---|---|
| 181 | Male | 23 | 91000 |
| 84 | Female | 10 | 71200 |
| 204 | Male | 39 | 148000 |
| 186 | Male | 16 | 94000 |
| 54 | Female | 9 | 82800 |
| 109 | Female | 14 | 79400 |
summary(g)
## Gender Experience Salary
## Female:140 Min. : 2.00 Min. : 53400
## Male : 68 1st Qu.: 7.00 1st Qu.: 66000
## Median :10.00 Median : 74000
## Mean :12.05 Mean : 79844
## 3rd Qu.:16.00 3rd Qu.: 88000
## Max. :39.00 Max. :194000
qplot(Experience, Salary, data=g, main="Salary versus Experience")
## Superimpose the line of best fit on this scatterplot
options(repr.plot.width=6, repr.plot.height=5)
plot(g$Experience,g$Salary, main="Salary versus Experience")
l1 <- lm(Salary ~ Experience, data=g)
abline(l1)
## Examine the best fit linear model
names(summary(l1))
## [1] "call" "terms" "residuals" "coefficients"
## [5] "aliased" "sigma" "df" "r.squared"
## [9] "adj.r.squared" "fstatistic" "cov.unscaled"
l1$coefficients
## (Intercept) Experience
## 59033.075 1727.311
summary(l1)$r.squared
## [1] 0.3149884
summary(l1)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59033.075 2499.8476 23.614670 1.592315e-60
## Experience 1727.311 177.4755 9.732669 1.168931e-18
qplot(Experience, Salary, data=g, geom='point', color=Gender, main="Salary versus Experience - Gender Differentiated")
### Fitting the new Gender-differentiated model
g <- cbind(g,Female=ifelse(as.numeric(g$Gender)==1,1,0))
l2 <- lm(Salary ~ Experience + Experience*Female + Female, data=g)
summary(l2)
##
## Call:
## lm(formula = Salary ~ Experience + Experience * Female + Female,
## data = g)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71048 -9278 -1701 9166 47932
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58299.3 2998.6 19.442 < 2e-16 ***
## Experience 2753.0 199.8 13.781 < 2e-16 ***
## Female 8034.3 4110.6 1.955 0.052 .
## Experience:Female -2086.2 287.3 -7.261 7.95e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15110 on 204 degrees of freedom
## Multiple R-squared: 0.5561, Adjusted R-squared: 0.5495
## F-statistic: 85.18 on 3 and 204 DF, p-value: < 2.2e-16
options(repr.plot.width=6, repr.plot.height=4)
plot(g$Experience,g$Salary, col=ifelse(as.numeric(g$Gender)==1,"blue","red"),
#pch=ifelse(as.numeric(g$Gender)==1,1,19),
pch=19,
main="Salary versus Experience - Gender Differentiated",xlab="Experience in years",ylab="Salary")
legend("topleft", c("Female", "Male"), col=c("blue", "red"),lty=1)
abline(l2$coefficients[1]+l2$coefficients[3],l2$coefficients[2]+l2$coefficients[4],col="blue")
abline(l2$coefficients[1],l2$coefficients[2],col="red")
summary(l2)
##
## Call:
## lm(formula = Salary ~ Experience + Experience * Female + Female,
## data = g)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71048 -9278 -1701 9166 47932
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58299.3 2998.6 19.442 < 2e-16 ***
## Experience 2753.0 199.8 13.781 < 2e-16 ***
## Female 8034.3 4110.6 1.955 0.052 .
## Experience:Female -2086.2 287.3 -7.261 7.95e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15110 on 204 degrees of freedom
## Multiple R-squared: 0.5561, Adjusted R-squared: 0.5495
## F-statistic: 85.18 on 3 and 204 DF, p-value: < 2.2e-16
options(repr.plot.width=6, repr.plot.height=3.5)
ggplot(g, aes(x=Experience, y=Salary, color=Gender)) + geom_point() + geom_smooth(method='lm')
# function to return multiples of 3
m3 <- function(f=1, t) {
mul <- c()
for (i in f:t) {
k <- i%%3
if (k==0)
mul <- c(mul,i)
}
return(mul)
}
j <- m3(,15) # m3(to=15)
j
## [1] 3 6 9 12 15
# function to return multiples of an arbitrary integer, m
mm <- function(f, t, m) {
mul <- c()
for (i in f:t) {
k <- i%%m
if (k==0)
mul <- c(mul,i)
}
return(mul)
}
m3(13,29)
## [1] 15 18 21 24 27
mm(13,29,3)
## [1] 15 18 21 24 27
mm(12,200,17)
## [1] 17 34 51 68 85 102 119 136 153 170 187