Using for(i in x) is an efficient way to repeatedly execute a piece of code, once for each element of x. In this recitation, we will (1) execute basic loops, (2) create basic functions and (3) make some plots using ggplot2.
Relevant functions: set.seed(), rnorm(), for(i in x), sample(), cat(), print(), replicate(), sqrt(), prod().
Load the lfpog.csv in R.
# Read the survey data through a full path instead of calling setwd():
# changing the working directory is a hidden side effect that silently alters
# every other relative path used later in the session.
# Edit data_dir to point at the folder that holds lfpog.csv on your machine.
data_dir <- "~/Google Drive/Penn/TA/Intro to DS/Rk_Recitation/Data"
lfp <- read.csv(file.path(data_dir, "lfpog.csv"))
# Printing one message per year by hand: the same call is written out six
# times, with only the year changing. Lines starting with "##" are the output.
print(paste("The year is", 2010))
## [1] "The year is 2010"
print(paste("The year is", 2011))
## [1] "The year is 2011"
print(paste("The year is", 2012))
## [1] "The year is 2012"
print(paste("The year is", 2013))
## [1] "The year is 2013"
print(paste("The year is", 2014))
## [1] "The year is 2014"
print(paste("The year is", 2015))
## [1] "The year is 2015"
Instead of writing six nearly identical lines of code by hand, you can write a loop that does this for you:
# Loop over the years 2010 to 2015; the body runs once per year, with `year`
# taking each value in turn.
for (year in seq(2010, 2015, by = 1)) {
  print(paste("The year is", year))
}
## [1] "The year is 2010"
## [1] "The year is 2011"
## [1] "The year is 2012"
## [1] "The year is 2013"
## [1] "The year is 2014"
## [1] "The year is 2015"
# rnorm()
set.seed(150) # Setting the seed for replication purposes
# Drawing 1000 values from a normal distribution with mean 45 and sd 15
myData <- rnorm(n = 1000, mean = 45, sd = 15)
# length(), mean() and sd()
length(myData) # How many observations?
## [1] 1000
mean(myData) # What is the mean?
## [1] 44.52407
sd(myData) # What is the standard deviation?
## [1] 14.85105
for(i in x)
# sample() and print()
set.seed(300) # Setting the seed for replication purposes
for (i in 1:5) { # Specifying the number of iterations
  # Sampling one observation from the myData vector and storing it in "obs"
  obs <- sample(myData, size = 1)
  print(obs) # Printing the value of "obs"
  # Printing a progress message after each iteration
  cat("I have finished", i, "iterations \n")
}
## [1] 44.06057
## I have finished 1 iterations
## [1] 32.19679
## I have finished 2 iterations
## [1] 34.89336
## I have finished 3 iterations
## [1] 41.51256
## I have finished 4 iterations
## [1] 63.42954
## I have finished 5 iterations
# sample() and squaring with ^2
set.seed(300) # Setting the seed for replication purposes
# Pre-allocating a numeric vector with one slot per iteration, rather than
# growing an empty vector inside the loop
results <- numeric(5)
for (i in seq_along(results)) {
  # Sampling one observation from the myData vector and storing it in "obs"
  obs <- sample(myData, size = 1)
  # Calculating the square (not the square root) of "obs" and storing it in
  # the i-th slot of "results"
  results[i] <- obs^2
  # Printing the sampled value and its square after each iteration
  cat("The square of", obs, "is", results[i], "\n")
}
## The square of 44.06057 is 1941.334
## The square of 32.19679 is 1036.633
## The square of 34.89336 is 1217.546
## The square of 41.51256 is 1723.292
## The square of 63.42954 is 4023.307
sample() and mean()
set.seed(300) # Setting the seed for replication purposes
# Pre-allocating a numeric vector with one slot per iteration, rather than
# growing an empty vector inside the loop
results <- numeric(5)
for (i in seq_along(results)) {
  # Sampling two observations from the myData vector and storing them in "obs"
  obs <- sample(myData, size = 2)
  # Calculating the mean of the two sampled values and storing it in the i-th
  # slot of "results"
  results[i] <- mean(obs)
  # Printing both sampled values and their mean after each iteration
  cat("The mean of", obs[1], "and", obs[2], "is", results[i], "\n")
}
## The mean of 44.06057 and 32.19679 is 38.12868
## The mean of 34.89336 and 41.51256 is 38.20296
## The mean of 63.42954 and 30.3382 is 46.88387
## The mean of 59.36095 and 32.10175 is 45.73135
## The mean of 59.5734 and 72.12265 is 65.84803
# Minimum, quartiles, median, mean and maximum of the values stored in "results"
summary(results)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 38.13 38.20 45.73 46.96 46.88 65.85
Plotting the cumulative mean
# "results" holds the mean of each individual sample. The cumulative mean
# after k samples is the average of the first k of those values, so it has to
# be computed before plotting; plotting "results" directly would show the raw
# sample means instead.
cumulative_mean <- cumsum(results) / seq_along(results)
plot(
  cumulative_mean,
  type = "l", ylab = "Cumulative Mean", xlab = "", lwd = 2, ylim = c(30, 60)
)
# Dashed red reference line at the overall mean of "results", which is the
# value the cumulative mean reaches at the last sample
abline(h = mean(results, na.rm = TRUE), col = "red", lty = 2)
# Print large numbers in fixed notation (e.g. 1000000 rather than 1e+06)
options(scipen = 999)
library(ggplot2)
# Initialise a ggplot object: only the data and the x/y mapping are given,
# no geom layer has been added yet
ggplot(data = lfp, mapping = aes(x = age, y = inc))
### 3.2 Creating a scatter plot
With these commands you have now created a blank graph with the x and y axes. Now, we need to add points/bars/lines to it.
library(ggplot2)
# Same data and mapping as before, plus a point layer: one point per row of lfp
g <- ggplot(data = lfp, mapping = aes(x = age, y = inc)) +
  geom_point()
g
###### Now, you can also add a smoothing line to it
library(ggplot2)
# Overlay a fitted line on the existing scatter plot; method = "lm" requests a
# linear-model fit
g <- g +
  geom_smooth(method = "lm")
g
### 3.3 Changing the axis limits and re-naming the axis
# Restrict the x axis to 0-100 and the y axis to 0-200, then add a title,
# subtitle and axis labels
g2 <- g +
  xlim(c(0, 100)) +
  ylim(c(0, 200)) +
  labs(
    title = "Relationship between age and income",
    subtitle = "From LFP survey",
    y = "Income in 000s",
    x = "Age of Respondents"
  )
g2
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
### 3.4 Changing the theme of the graph
# Same plot with the black-and-white theme
g3 <- g +
  theme_bw() +
  labs(
    title = "Relationship between age and income",
    subtitle = "BW Theme",
    y = "Income in 000s",
    x = "Age of Respondents"
  )
# Same plot with the classic theme
g4 <- g +
  theme_classic() +
  labs(
    title = "Relationship between age and income",
    subtitle = "Classic Theme",
    y = "Income in 000s",
    x = "Age of Respondents"
  )
# Display the classic-theme plot first, then the black-and-white one
g4
g3
# Colour mapped inside aes(): each point's colour varies with the age column
g3 +
  geom_point(aes(col = age), size = 3) +
  geom_smooth(method = "lm", col = "firebrick", size = 2)
# Changing the color of ALL the points to red irrespective of age
# Colour given outside aes(): a fixed value, so it does not vary with the data
g3 +
  geom_point(col = "red", size = 1.5) +
  geom_smooth(method = "lm", col = "steelblue", size = 2)
Let’s create the bar plot of the mean income of employed and unemployed women in the survey
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Mean of inc (missing values dropped) within each level of the lfp column
lfp %>% group_by(lfp) %>% summarise(mean(inc,na.rm=T))
## # A tibble: 2 x 2
## lfp `mean(inc, na.rm = T)`
## <fct> <dbl>
## 1 no 21.7
## 2 yes 18.9
# Same grouped summary, stored as a plain data frame so it can be plotted.
# na.rm = TRUE is spelled out in full because T is an ordinary variable that
# can be reassigned, whereas TRUE is a reserved word.
lfp.yn <- lfp %>%
  group_by(lfp) %>%
  summarise(mean(inc, na.rm = TRUE)) %>%
  as.data.frame()
# Give both columns readable names; they double as the axis labels in the plots
colnames(lfp.yn) <- c("Labour_Force_Participation", "Mean_Income")
# Bar plot of mean income by labour force participation.
# stat = "identity" draws each bar at the value already stored in Mean_Income
# rather than counting rows; geom_text() writes that value as a label, and the
# negative vjust nudges it above its anchor point.
ggplot(
  data = lfp.yn,
  mapping = aes(x = Labour_Force_Participation, y = Mean_Income)
) +
  geom_bar(stat = "identity", fill = "steelblue", width = 0.5) +
  geom_text(mapping = aes(label = Mean_Income), vjust = -0.3, size = 3.5) +
  theme_minimal()
## coord flip
# Same bar plot as above with the x and y axes swapped by coord_flip(), so the
# bars run horizontally.
# NOTE(review): vjust shifts the label vertically on screen; with horizontal
# bars hjust may place the labels better - confirm on the rendered plot.
ggplot(
  data = lfp.yn,
  mapping = aes(x = Labour_Force_Participation, y = Mean_Income)
) +
  geom_bar(stat = "identity", fill = "steelblue", width = 0.5) +
  geom_text(mapping = aes(label = Mean_Income), vjust = -0.3, size = 3.5) +
  theme_minimal() +
  coord_flip()
Using the lfpog.csv data, draw 4 samples of size 5 and calculate the mean income of the respondents in each sample.
Store these means in a vector and then print them out in order.
Calculate the cumulative mean and plot it
Create a bar plot of the mean income of respondents whose husbands work versus those who do not.