# Comparison of numerics
-6 * 14 != 17 - 101
# Comparison of character strings (comparison is case-sensitive)
"useR" == "user"
# Compare a logical with a numeric (TRUE is coerced to 1)
TRUE == 1
# Comparing vectors
# Daily profile views; the second linkedin value was missing in the notes and
# is restored from the later definition of the same vector (16, 9, 13, ...).
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)
# Elementwise comparisons return a logical vector of the same length.
linkedin > 15
linkedin <= 5
# NOTE(review): views, last and li_df are not defined in this file; they are
# presumably provided by the exercise environment - confirm before running.
# When does views equal 13?
views == 13
# When is views less than or equal to 14?
views <= 14
# Is last under 5 or above 10?
last < 5 | last > 10
# Is last between 15 (exclusive) and 20 (inclusive)?
last > 15 & last <= 20
# Select the second column, named day2, from li_df: second
second <- li_df[, 2]
# Reduce the speed in steps of 7 until it is no longer above 30.
speed <- 64
while (speed > 30) {
  print("Slow down!")
  speed <- speed - 7
}
# Show the final speed.
speed
# Slow down in larger steps while the speed is above 48, and stop looping
# altogether when the speed is above 80.
# NOTE(review): the notes had unbalanced parentheses/braces here. The else
# branch and its decrement of 6 are reconstructed so that the loop terminates
# for speeds between 31 and 48 - confirm the intended step size.
# `speed` must be set before this loop runs.
while (speed > 30) {
  print(paste("Your speed is", speed))
  if (speed > 48) {
    print("Slow down big time!")
    speed <- speed - 11
  } else {
    print("Slow down!")
    speed <- speed - 6
  }
  if (speed > 80) {
    break
  }
}
# The linkedin vector has already been defined for you
# Loop version 1: iterate over the values directly.
for (i in linkedin) {
  print(i)
}
# Loop version 2: iterate over the indices. seq_along() is used instead of
# 1:length() so that an empty vector yields zero iterations.
for (i in seq_along(linkedin)) {
  print(linkedin[i])
}
# The nyc list is already specified
nyc <- list(
  pop = 8405837,
  boroughs = c("Manhattan", "Bronx", "Brooklyn", "Queens", "Staten Island"),
  capital = FALSE
)
# Iterate over the list elements.
# NOTE(review): the loop body is empty in the notes; presumably print(i) was
# intended - confirm.
for (i in nyc) {
}
# Consult the documentation on the mean() function
help(mean)
# Inspect the arguments of the mean() function
args(mean)
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)
# Average number of views per platform. avg_li was printed below but never
# assigned in the notes, so it is computed here.
avg_li <- mean(linkedin)
avg_fb <- mean(facebook)
print(avg_li)
print(avg_fb)
# Mean of the daily totals, plain and with 20% trimmed from each end.
avg_sum <- mean(linkedin + facebook)
avg_sum_trimmed <- mean(linkedin + facebook, trim = 0.2)
# na.rm comes in when you have NAs in your data
mean(abs(linkedin - facebook), na.rm = TRUE)
# Create a function pow_two()
#' Square a number.
#'
#' @param x A numeric vector.
#' @return `x` raised to the power of two, elementwise.
pow_two <- function(x) {
  x^2
}
pow_two(12)
#' Add the absolute values of two numbers.
#'
#' @param a A numeric vector.
#' @param b A numeric vector.
#' @return The elementwise sum of `abs(a)` and `abs(b)`.
sum_abs <- function(a, b) {
  magnitude_a <- abs(a)
  magnitude_b <- abs(b)
  magnitude_a + magnitude_b
}
sum_abs(-2, 3)
# filter(): keep rows that match the conditions
gapminder %>%
  filter(year == 1957)
gapminder %>%
  filter(year == 2002, country == "China")
filter(starwars, hair_color == "none" & eye_color == "black")
filter(starwars, hair_color == "none" | eye_color == "black")
# arrange(): sort rows, ascending by default, descending with desc()
gapminder %>%
  arrange(lifeExp)
gapminder %>%
  arrange(desc(lifeExp))
gapminder %>%
  filter(year == 1957) %>%
  arrange(desc(pop))
# mutate(): overwrite an existing column, or add a new one
gapminder %>%
  mutate(lifeExp = 12 * lifeExp)
gapminder %>%
  mutate(lifeExpMonths = 12 * lifeExp)
# Combine the verbs: life expectancy in months for 2007, longest first
gapminder %>%
  filter(year == 2007) %>%
  mutate(lifeExpMonths = 12 * lifeExp) %>%
  arrange(desc(lifeExpMonths))
# Still using the gapminder dataset
# Create gapminder_1952
# Put pop on the x-axis and gdpPercap on the y-axis
library(gapminder)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Keep only the 1952 observations.
gapminder_1952 <- filter(gapminder, year == 1952)
# Scatter plot of GDP per capita against population.
ggplot(data = gapminder_1952, mapping = aes(x = pop, y = gdpPercap)) +
  geom_point()
# Change this plot to put the x-axis on a log scale
ggplot(gapminder_1952, aes(x = pop, y = lifeExp)) +
  geom_point() +
  scale_x_log10()
# To put the y-axis on a log scale as well, add scale_y_log10()
# Adding color
ggplot(gapminder_1952, aes(x = pop, y = lifeExp, color = continent)) +
  geom_point() +
  scale_x_log10()
# Size aesthetic
ggplot(
  gapminder_1952,
  aes(x = pop, y = lifeExp, color = continent, size = gdpPercap)
) +
  geom_point() +
  scale_x_log10()
# Another plot example with added factors, including faceting
library(gapminder)
library(dplyr)
library(ggplot2)
# Scatter plot comparing gdpPercap and lifeExp, with color representing continent
# and size representing population, faceted by year
ggplot(
  gapminder,
  aes(x = gdpPercap, y = lifeExp, color = continent, size = pop)
) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~ year)
# Using the gapminder dataset
# Summarize to find the median life expectancy
gapminder %>%
  summarize(medianLifeExp = median(lifeExp))
# Median life expectancy in 1957
gapminder %>%
  filter(year == 1957) %>%
  summarize(medianLifeExp = median(lifeExp))
# Median life expectancy and maximum GDP per capita in 1957
gapminder %>%
  filter(year == 1957) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )
# Now using group_by
# Find median life expectancy and maximum GDP per capita in each continent in 1957
gapminder %>%
  group_by(continent) %>%
  filter(year == 1957) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )
# Group by multiple variables
gapminder %>%
  group_by(continent, year) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )
# Median GDP per capita for every year/continent combination.
by_year_continent <- gapminder %>%
  group_by(year, continent) %>%
  summarize(medianGdpPercap = median(gdpPercap))
# One line per continent. The color mapping has to be inside aes(); outside it
# the argument is ignored and all continents are joined into a single line.
ggplot(
  by_year_continent,
  aes(x = year, y = medianGdpPercap, color = continent)
) +
  geom_line() +
  expand_limits(y = 0)
# Summarize the median gdpPercap by continent in 1952
by_continent <- gapminder %>%
  filter(year == 1952) %>%
  group_by(continent) %>%
  summarize(medianGdpPercap = median(gdpPercap))
# Bar plot with one bar per continent.
ggplot(by_continent, aes(x = continent, y = medianGdpPercap)) +
  geom_col()
# 1952 observations, used by both plots below (the notes assigned this twice
# with the identical expression).
gapminder_1952 <- gapminder %>%
  filter(year == 1952)
# Histogram of population.
ggplot(gapminder_1952, aes(x = pop)) +
  geom_histogram()
# Box plot of GDP per capita by continent, on a log scale.
ggplot(gapminder_1952, aes(x = continent, y = gdpPercap)) +
  geom_boxplot() +
  scale_y_log10() +
  ggtitle("Comparing GDP per capita across continents")
# NOTE(review): comics is not defined in this file; it is presumably provided
# by the exercise environment - confirm before running.
comics
library(ggplot2)
# Inspect the levels of the two factors and cross-tabulate them.
levels(comics$align)
levels(comics$gender)
table(comics$align, comics$gender)
# Drop level
comics <- comics %>%
  filter(align != "Reformed Criminals") %>%
  droplevels()
library(ggplot2)
# Side-by-side bars: gender within alignment, then alignment within gender.
ggplot(comics, aes(x = align, fill = gender)) +
  geom_bar(position = "dodge")
ggplot(comics, aes(x = gender, fill = align)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 90))
# Stacked counts, then stacked proportions.
ggplot(comics, aes(x = align, fill = gender)) +
  geom_bar()
ggplot(comics, aes(x = align, fill = gender)) +
  geom_bar(position = "fill") +
  ylab("proportion")
# Reorder the alignment levels.
comics$align <- factor(
  comics$align,
  levels = c("Bad", "Neutral", "Good")
)
# Create a bar plot of gdpPercap by country
# NOTE(review): oceania_1952 was used but never defined in the notes; it is
# reconstructed from its name as the Oceania rows of gapminder for 1952 -
# confirm.
oceania_1952 <- gapminder %>%
  filter(continent == "Oceania", year == 1952)
ggplot(oceania_1952, aes(x = country, y = gdpPercap)) +
  geom_col()
# Dataset: midterm scores, section attendance (0/1)
# NOTE(review): midterm_scores is not defined in this file - confirm where it
# is loaded.
head(midterm_scores)
# Question of interest:
#   does going to section improve your score on the midterm?
# Hypotheses:
#   H0: mu_section - mu_nosection = 0
#   Ha: mu_section - mu_nosection != 0
# Test statistic: difference in means
# Split dataset
went_to_section <- midterm_scores[midterm_scores$section == 1, ]
didnt_go_to_section <- midterm_scores[midterm_scores$section == 0, ]
head(went_to_section)
head(didnt_go_to_section)
# Initialize our test
nsims <- 10000
# Pool the scores and the matching section labels, in the same order.
combined_scores <- c(
  went_to_section$midterm_score,
  didnt_go_to_section$midterm_score
)
combined_section <- c(
  went_to_section$section,
  didnt_go_to_section$section
)
# Observed difference in mean score (section minus no section).
diff_obs <- mean(went_to_section$midterm_score) -
  mean(didnt_go_to_section$midterm_score)
diff_obs
# Permutation distribution of the difference in means: shuffle the section
# labels, then recompute the difference between the two relabelled groups.
diffs <- rep(NA_real_, nsims)
for (i in seq_len(nsims)) {
  shuffled_labels <- sample(combined_section, replace = FALSE)
  diffs[i] <- mean(combined_scores[shuffled_labels == 1]) -
    mean(combined_scores[shuffled_labels == 0])
}
# Histogram of the simulated differences
hist(diffs)
# Calculate the two-sided p-value:
# p-value = (number of differences at least as extreme as diff_obs) / nsims
length(diffs[abs(diffs) >= abs(diff_obs)]) / nsims
# Same value computed as a proportion; uses >= to match the line above
# (the notes used > here, which disagrees whenever a tie occurs).
mean(abs(diffs) >= abs(diff_obs))
# Read the firearms data and keep OECD countries other than the United States
# and Mexico.
data <- read.csv("firearms.csv", header = TRUE, sep = ",")
keeprows <- data[, "OECD"] == "Y"
keeprows <- keeprows & !(data[, "country"] == "United States") &
  !(data[, "country"] == "Mexico")
data <- data[keeprows, ]
# One column of y values per chart; a randomly chosen column holds the real
# data.
charts <- matrix(NA_real_, nrow(data), 9)
realchart <- sample(1:9, 1)
# NOTE(review): these two lines overwrite both variables with 1..n, which
# looks like a placeholder for testing the plotting code - confirm whether
# they should be removed before using the real data.
data[, "firearms"] <- seq_len(nrow(data))
data[, "homicides"] <- seq_len(nrow(data))
# Fill every chart except for realchart with shuffled y values.
# Hints from the notes: the y values are data[, "homicides"]; to set column i
# of charts, use charts[, i] <- ...
# NOTE(review): the notes left this step blank; shuffling with sample() is
# inferred from "except for realchart" - confirm.
for (i in seq_len(9)) {
  if (i == realchart) {
    charts[, i] <- data[, "homicides"]
  } else {
    charts[, i] <- sample(data[, "homicides"])
  }
}
# Now plot the charts in a grid
par(mfrow = c(3, 3))
for (i in 1:9) {
  plot(data[, "firearms"], charts[, i], xlab = "", ylab = "")
}
cat("Press enter to reveal real chart \n")
readline()
cat("Real data is in chart number", realchart, "\n")
library(readxl)
library(dplyr)
library(ggplot2)
# Load the spreadsheet and take a first look.
DF <- read_excel("../Data/TMP.xlsx")
head(DF)
# Clean up the age cohort labels.
# NOTE(review): 42898 presumably is what the spreadsheet stored in place of
# the "6-12" label - confirm against the source file.
DF$Age_Cohort <- gsub("42898", "6-12", DF$Age_Cohort)
DF$Age_Cohort <- gsub("0 - 5", "0-5", DF$Age_Cohort)
# Put the cohorts in age order rather than alphabetical order.
DF$Age_Cohort <- factor(
  DF$Age_Cohort,
  levels = c("0-5", "6-12", "13-17", "18-21", "22-50", "51 +")
)
table(DF$Age_Cohort)
# Mean, median, count and standard deviation of expenditures for each gender.
summarize(
  group_by(DF, Gender),
  ME = mean(Expenditures),
  MDE = median(Expenditures),
  n = n(),
  SD = sd(Expenditures)
)
# Box plot of expenditures by gender. Every layer must be joined with `+`;
# without it after ggplot(), the layers below are a separate expression and
# are never added to the plot.
ggplot(data = DF, aes(x = Gender, y = Expenditures, fill = Gender)) +
  geom_boxplot() +
  theme_bw() +
  scale_fill_manual(values = c("pink", "blue"))
# Bar chart of the mean expenditure for each gender.
DF %>%
  group_by(Gender) %>%
  summarize(
    ME = mean(Expenditures),
    MDE = median(Expenditures),
    n = n()
  ) %>%
  ggplot(aes(x = Gender, y = ME, fill = Gender)) +
  geom_bar(stat = "identity") +
  labs(title = "Average Expenditure by Gender", y = "Mean Expenditure") +
  theme_bw() +
  scale_fill_manual(values = c("pink", "blue"))
# Mean expenditure and group size for the two ethnicities being compared.
DF %>%
  filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>%
  group_by(Ethnicity) %>%
  summarize(ME = mean(Expenditures), n = n())