Conditionals

Comparing data

Comparison of numerics -6 * 14 != 17 - 101

Comparison of character strings "useR" == "user"

Compare a logical with a numeric TRUE == 1

Comparing Vectors

  • linkedin <- c(16, 9, 13, 5, 2, 17, 14)

  • facebook <- c(17, 7, 5, 16, 8, 13, 14)

linkedin > 15

linkedin <= 5

When does views equal 13? views == 13

When is views less than or equal to 14? views <= 14

Is last under 5 or above 10? last < 5 | last > 10

Is last between 15 (exclusive) and 20 (inclusive)? last >15 & last <= 20

Select the second column, named day2, from li_df: second <- li_df[, 2]

Loops

# Start above the limit, then brake in steps of 7 for as long as the speed
# is still greater than 30.
speed <- 64

while (speed > 30) {
  print("Slow down!")
  speed <- speed - 7
}

# Show the speed the loop ended on.
speed

# Report the speed on every pass and brake harder while it is above 48.
# NOTE(review): the original snippet never closed the `while` header, had no
# `else` branch and left its braces unbalanced. The `else` decrement of 7 is
# an assumption borrowed from the previous loop -- confirm the intended value.
while (speed > 30) {
  print(paste("Your speed is", speed))

  if (speed > 48) {
    print("Slow down big time!")
    speed <- speed - 11
  } else {
    print("Slow down!")
    speed <- speed - 7
  }

  # Leave the loop early if the speed is above 80.
  if (speed > 80) {
    break
  }
}

The linkedin vector has already been defined for you

Loop version 1

# Loop version 1: walk over the values of linkedin and print each one.
for (i in linkedin) {
  print(i)
}

Loop version 2

# Loop version 2: walk over the positions of linkedin and print each element.
# seq_along() yields an empty sequence for an empty vector, whereas
# 1:length(x) would yield c(1, 0).
for (i in seq_along(linkedin)) {
  print(linkedin[i])
}

Loop over a list

The nyc list is already specified

# A named list holding a number, a character vector and a logical.
nyc <- list(
  pop = 8405837,
  boroughs = c("Manhattan", "Bronx", "Brooklyn", "Queens", "Staten Island"),
  capital = FALSE
)

Loop version 1

# Loop version 1: print every element of the nyc list in turn.
# A bare `print` only names the function; it has to be called with the
# current element for anything to be shown.
for (i in nyc) {
  print(i)
}

Functions

# Profile views per day on each platform.
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)

# Average views per platform. avg_li has to be computed before it is printed.
avg_li <- mean(linkedin)
avg_fb <- mean(facebook)

print(avg_li)
print(avg_fb)

avg_sum <- mean(linkedin + facebook)

avg_sum_trimmed <- mean(linkedin + facebook, trim = 0.2)

na.rm comes in when you have NA’s in your data

mean(abs(linkedin - facebook), na.rm = TRUE)

Creating Functions

Create a function pow_two()

# Raise x to the second power and return the result.
pow_two <- function(x) {
  result <- x^2
  result
}

  • Use the function

pow_two(12)

  • Create a function sum_abs()

# Add the absolute value of a to the absolute value of b.
sum_abs <- function(a, b) {
  abs_a <- abs(a)
  abs_b <- abs(b)
  abs_a + abs_b
}

sum_abs(-2, 3)

Tidyverse

Using the filter function

  • Filter the gapminder dataset for the year 1957

gapminder %>% filter(year == 1957)

  • Filter for China in 2002

# Both conditions must hold; the country name is a plain double-quoted string.
gapminder %>% filter(year == 2002, country == "China")

  • Using multiple criteria for this Star Wars set “and” and “or”

# "and": rows where both conditions are TRUE.
filter(starwars, hair_color == "none" & eye_color == "black")

# "or": rows where at least one condition is TRUE.
filter(starwars, hair_color == "none" | eye_color == "black")

Sorting

  • Sort in ascending order of lifeExp (using the gapminder dataset)

gapminder %>% arrange(lifeExp)

  • Sort in descending order of lifeExp

gapminder %>% arrange(desc(lifeExp))

  • Filter for the year 1957, then arrange in descending order of population

gapminder %>%

filter(year == 1957) %>%

arrange(desc(pop))

Mutate

  • Use mutate to change lifeExp to be in months

gapminder %>% mutate(lifeExp = 12 * lifeExp)

  • Use mutate to create a new column called lifeExpMonths

gapminder %>% mutate(lifeExpMonths = 12 * lifeExp)

  • Filter, mutate, and arrange the gapminder dataset

gapminder %>%

filter(year == 2007) %>%

mutate(lifeExpMonths = 12 * lifeExp) %>%

arrange(desc(lifeExpMonths))

ggplot2

*Still using the gapminder dataset

*Create gapminder_1952

*put pop on the x-axis and gdpPercap on the y-axis

library(gapminder)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)


gapminder_1952 <- gapminder %>% 
  


filter(year == 1952)



ggplot(gapminder_1952, aes(x = pop, y = gdpPercap)) +
  geom_point()

*Change this plot to put the x-axis on a log scale

ggplot(gapminder_1952, aes(x = pop, y = lifeExp)) +

geom_point()+

scale_x_log10()

*To put the y-axis on a log scale as well, you would add scale_y_log10()

*adding color

ggplot(gapminder_1952, aes(x = pop, y = lifeExp, color = continent)) +

geom_point() + scale_x_log10()

*size aesthetic

ggplot(gapminder_1952, aes(x = pop, y = lifeExp, color = continent,

size = gdpPercap)) +

geom_point() +

scale_x_log10()

*Another plot example with added factors, including faceting

library(gapminder)
library(dplyr)
library(ggplot2)

# Scatter plot comparing gdpPercap and lifeExp, with color representing continent
# and size representing population, faceted by year
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp, color = continent, size = pop)) + geom_point() + scale_x_log10() + facet_wrap(~ year)

Summarize

gapminder %>% summarize(medianLifeExp=median(lifeExp))

gapminder %>%

filter(year == 1957) %>%

summarize(medianLifeExp = median(lifeExp))

maximum GDP per capita

gapminder %>%

filter(year == 1957) %>%

summarize(medianLifeExp = median(lifeExp), maxGdpPercap = max(gdpPercap))

*Now using group_by

*Find median life expectancy and maximum GDP per capita in each continent in 1957

gapminder %>%

group_by(continent) %>%

filter(year == 1957) %>%

summarize(medianLifeExp = median(lifeExp), maxGdpPercap =

max(gdpPercap))

*Group by multiple objects

gapminder %>% group_by(continent, year) %>% summarize(medianLifeExp = median(lifeExp), maxGdpPercap = max(gdpPercap))

Line Graph

# Median GDP per capita for every year/continent combination.
by_year_continent <- gapminder %>%
  group_by(year, continent) %>%
  summarize(medianGdpPercap = median(gdpPercap))

# One line per continent. `color` has to sit inside aes() to be mapped to the
# continent column; outside aes() it is not treated as an aesthetic mapping.
ggplot(by_year_continent,
       aes(x = year, y = medianGdpPercap, color = continent)) +
  geom_line() +
  expand_limits(y = 0)

Bar Graph

Summarize the median gdpPercap by continent in 1952

by_continent <- gapminder %>%

filter(year == 1952) %>%

group_by(continent) %>%

summarize(medianGdpPercap = median(gdpPercap))

  • Create a bar plot showing medianGdp by continent

ggplot(by_continent, aes(x = continent, y = medianGdpPercap)) + geom_col()

Histogram

gapminder_1952 <- gapminder %>%

filter(year == 1952)

  • Create a histogram of population (pop)

ggplot(gapminder_1952, aes(x = pop)) + geom_histogram()

Boxplots and Adding a Title

gapminder_1952 <- gapminder %>% filter(year == 1952)

  • Add a title to this graph: “Comparing GDP per capita across continents”

# Boxplot of GDP per capita per continent on a log-scaled y-axis, with a title.
ggplot(gapminder_1952, aes(x = continent, y = gdpPercap)) +
  geom_boxplot() +
  scale_y_log10() +
  ggtitle("Comparing GDP per capita across continents")

Exploring Data

Contingency Table example

comics

library(ggplot2)

  • Check levels of align

levels(comics$align)

  • Check the levels of gender

levels(comics$gender)

  • Create a 2-way contingency table

# Cross-tabulate alignment against gender; both are columns of comics.
table(comics$align, comics$gender)

*Drop level

# Remove the "Reformed Criminals" rows, then drop factor levels that are no
# longer used.
comics <- comics %>%
  filter(align != "Reformed Criminals") %>%
  droplevels()

Side By Side Bar Chart

  • Load ggplot2

library(ggplot2)

  • Create side-by-side barchart of gender by alignment

# Side-by-side bars: one group per alignment, one bar per gender.
ggplot(comics, aes(x = align, fill = gender)) +
  geom_bar(position = "dodge")

  • Create side-by-side barchart of alignment by gender

# Side-by-side bars: one group per gender, one bar per alignment.
# The x-axis labels are rotated by 90 degrees.
ggplot(comics, aes(x = gender, fill = align)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 90))

Counts vs Proportions

  • Plot of gender by align

ggplot(comics, aes(x = align, fill = gender)) +

geom_bar()

  • Plot proportion of gender, conditional on align

# position = "fill" stacks the bars to a common height, so the y-axis is
# relabelled as a proportion.
ggplot(comics, aes(x = align, fill = gender)) +
  geom_bar(position = "fill") +
  ylab("proportion")

Changing the orders of level example

# Rebuild align as a factor with an explicit level order.
# `$` extracts the column; `&` would be a logical AND of two objects.
comics$align <- factor(
  comics$align,
  levels = c("Bad", "Neutral", "Good")
)

Bar Plot

Create a bar plot of gdpPercap by country

ggplot(oceania_1952, aes(x = country, y = gdpPercap)) + geom_col()

Permutation Test in R Using the Midterm Problem

*dataset: midterm scores, section attendance (0/1)

head(midterm_scores)

went_to_section <- midterm_scores[midterm_scores$section == 1,]

didnt_go_to_section <- midterm_scores[midterm_scores$section == 0,]

head(went_to_section)

head(didnt_go_to_section)

*initialize our test

nsims <- 10000

combined_scores <- c(went_to_section$midterm_score,

didnt_go_to_section$midterm_score)

combined_section <- c(went_to_section$section,

didnt_go_to_section$section)

diff_obs <- mean(went_to_section$midterm_score) -

mean(didnt_go_to_section$midterm_score)

diff_obs

diffs <- rep(NA, nsims)

# For each simulation: shuffle the section labels without replacement, then
# store the difference in mean score between the shuffled "1" and "0" groups.
for (i in seq_len(nsims)) {
  shuffled_labels <- sample(combined_section, replace = FALSE)
  diffs[i] <- mean(combined_scores[shuffled_labels == 1]) -
    mean(combined_scores[shuffled_labels == 0])
}

histogram

hist(diffs)

# Share of simulated differences whose absolute value is at least as large as
# the observed one. The two forms agree when diffs holds nsims non-missing
# values, so both use the same `>=` comparison.
length(diffs[abs(diffs) >= abs(diff_obs)]) / nsims

mean(abs(diffs) >= abs(diff_obs))

Another Permutation/Randomization Test Using Countries

# Read the firearms data and keep the rows flagged OECD == "Y", leaving out
# the United States and Mexico.
data <- read.csv("firearms.csv", header = TRUE, sep = ",")
keeprows <- data[, "OECD"] == "Y"
keeprows <- keeprows & !(data[, "country"] == "United States") &
  !(data[, "country"] == "Mexico")
data <- data[keeprows, ]

# One column per chart (nine in total); one position is drawn at random to
# hold the real chart.
charts <- matrix(numeric(0), nrow(data), 9)
realchart <- sample(1:9, 1)

# Overwrite both columns with the row index 1..n.
data[, "firearms"] <- seq_len(nrow(data))
data[, "homicides"] <- seq_len(nrow(data))

# except for realchart
# NOTE(review): no visible step fills the columns of `charts` before they are
# plotted; the line above looks like the remnant of a comment describing that
# step -- confirm against the original script.
for (i in 1:9) {
  plot(data[, "firearms"], charts[, i], xlab = "", ylab = "")
}

# Wait for the user before revealing which chart is the real one.
cat("Press enter to reveal real chart \n")
readline()
cat("Real data is in chart number", realchart, "\n")

Characterization Test Using the Discrimination Problem

Loading Data

library(readxl)

library(dplyr)

library(ggplot2)

# Load the workbook and show the first rows.
DF <- read_excel("../Data/TMP.xlsx")
head(DF)

# Clean up the Age_Cohort labels before turning the column into a factor.
# NOTE(review): 42898 is presumably what the "6-12" label became when the
# spreadsheet was read -- confirm against the workbook.
DF$Age_Cohort <- gsub("42898", "6-12", DF$Age_Cohort)
DF$Age_Cohort <- gsub("0 - 5", "0-5", DF$Age_Cohort)

# Fix the level order explicitly.
DF$Age_Cohort <- factor(
  DF$Age_Cohort,
  levels = c("0-5", "6-12", "13-17", "18-21", "22-50", "51 +")
)

table(DF$Age_Cohort)

Making a Boxplot out of the Data

# Mean, median, count and standard deviation of Expenditures per Gender.
DF %>%
  group_by(Gender) %>%
  summarize(
    ME = mean(Expenditures),
    MDE = median(Expenditures),
    n = n(),
    SD = sd(Expenditures)
  )

# Boxplot of Expenditures per Gender. Every layer has to be joined with `+`,
# including the first one after ggplot().
ggplot(data = DF, aes(x = Gender, y = Expenditures, fill = Gender)) +
  geom_boxplot() +
  theme_bw() +
  scale_fill_manual(values = c("pink", "blue"))

Creating a Bar Graph out of the data

# Summarize Expenditures per Gender, then pipe the summary straight into a
# bar chart of the mean.
DF %>%
  group_by(Gender) %>%
  summarize(
    ME = mean(Expenditures),
    MDE = median(Expenditures),
    n = n()
  ) %>%
  ggplot(aes(x = Gender, y = ME, fill = Gender)) +
  # stat = "identity" draws bar heights from the ME column as given.
  geom_bar(stat = "identity") +
  labs(title = "Average Expenditure by Gender", y = "Mean Expenditure") +
  theme_bw() +
  scale_fill_manual(values = c("pink", "blue"))

Filtering The Values

# Keep two Ethnicity groups, then report mean Expenditures and row count for
# each of them.
DF %>%
  filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>%
  group_by(Ethnicity) %>%
  summarize(ME = mean(Expenditures), n = n())