words data setThe words vector ships with the stringr
package.
length(words)
## [1] 980
head(words)
## [1] "a" "able" "about" "absolute" "accept" "account"
four <- words[str_detect(words, "^.{4}$")]
length(four)
## [1] 263
head(four, 20)
## [1] "able" "also" "area" "away" "baby" "back" "ball" "bank" "base" "bear"
## [11] "beat" "best" "bill" "blow" "blue" "boat" "body" "book" "both" "busy"
four_five <- words[str_detect(words, "^.{4,5}$")]
length(four_five)
## [1] 463
head(four_five, 20)
## [1] "able" "about" "admit" "after" "again" "agent" "agree" "allow" "along"
## [10] "also" "apart" "apply" "area" "argue" "aware" "away" "awful" "baby"
## [19] "back" "ball"
second_st <- words[str_detect(words, "^.[st]")]
length(second_st)
## [1] 38
head(second_st, 20)
## [1] "as" "ask" "associate" "assume" "at" "attend"
## [7] "especial" "issue" "it" "item" "other" "otherwise"
## [13] "staff" "stage" "stairs" "stand" "standard" "start"
## [19] "state" "station"
oxx <- words[str_detect(words, "(.)(.)\\2")]
length(oxx)
## [1] 157
head(oxx, 20)
## [1] "accept" "account" "across" "add" "address"
## [6] "affect" "afford" "afternoon" "agree" "all"
## [11] "allow" "apparent" "appear" "apply" "appoint"
## [16] "approach" "appropriate" "arrange" "associate" "assume"
aeo <- words[str_detect(words, "a") &
str_detect(words, "e") &
str_detect(words, "o")]
aeo
## [1] "absolute" "afternoon" "another" "appropriate" "associate"
## [6] "colleague" "compare" "encourage" "operate" "organize"
## [11] "probable" "programme" "reason" "relation"
sentences data setlength(sentences)
## [1] 720
head(sentences, 3)
## [1] "The birch canoe slid on the smooth planks."
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."
sent_df <- tibble(sentence = sentences) %>%
mutate(has_the = str_detect(sentence, "\\b[Tt]he\\b"))
ggplot(sent_df, aes(x = has_the, fill = has_the)) +
geom_bar() +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.3) +
scale_x_discrete(labels = c("FALSE" = "Without 'the'",
"TRUE" = "With 'the'")) +
labs(title = "Sentences containing 'the' (or 'The')",
x = NULL, y = "Count") +
theme_minimal() +
theme(legend.position = "none")
sent_scatter <- tibble(sentence = sentences) %>%
mutate(
word_list = str_extract_all(sentence, "[A-Za-z]+"),
avg_word_len = map_dbl(word_list, ~ mean(str_length(.x))),
n_vowel_start = map_int(word_list,
~ sum(str_detect(.x, "^[aeiouAEIOU]")))
)
ggplot(sent_scatter, aes(avg_word_len, n_vowel_start)) +
geom_point(alpha = 0.5, colour = "steelblue") +
geom_smooth(method = "lm", se = FALSE, colour = "tomato") +
labs(title = "Sentence structure: avg word length vs. vowel-starting words",
x = "Average word length in the sentence",
y = "Number of words starting with a vowel") +
theme_minimal()
oed_raw <- read_lines("Oxford_English_Dictionary.txt")
oed <- tibble(line = oed_raw) %>%
filter(str_detect(line, "\\S")) # keep only lines containing non-whitespace
nrow(oed)
## [1] 36692
head(oed, 5)
## # A tibble: 5 × 1
## line
## <chr>
## 1 "A "
## 2 "A- prefix (also an- before a vowel sound) not, without (amoral). [greek]"
## 3 "Aa abbr. 1 automobile association. 2 alcoholics anonymous. 3 anti-aircraft."
## 4 "Aardvark n. Mammal with a tubular snout and a long tongue, feeding on termi…
## 5 "Ab- prefix off, away, from (abduct). [latin]"
wordsoed <- oed %>%
mutate(words = str_extract(line, "^\\S+"))
oed %>% select(words, line) %>% head(10)
## # A tibble: 10 × 2
## words line
## <chr> <chr>
## 1 A "A "
## 2 A- "A- prefix (also an- before a vowel sound) not, without (amoral).…
## 3 Aa "Aa abbr. 1 automobile association. 2 alcoholics anonymous. 3 ant…
## 4 Aardvark "Aardvark n. Mammal with a tubular snout and a long tongue, feedi…
## 5 Ab- "Ab- prefix off, away, from (abduct). [latin]"
## 6 Aback "Aback adv. \u007f take aback surprise, disconcert. [old english:…
## 7 Abacus "Abacus n. (pl. -cuses) 1 frame with wires along which beads are …
## 8 Abaft "Abaft naut. —adv. In the stern half of a ship. —prep. Nearer the…
## 9 Abandon "Abandon —v. 1 give up. 2 forsake, desert. 3 (often foll. By to; …
## 10 Abandoned "Abandoned adj. 1 deserted, forsaken. 2 unrestrained, profligate."
all_vowels_y <- oed %>%
mutate(w = str_to_lower(words)) %>%
filter(str_detect(w, "a"),
str_detect(w, "e"),
str_detect(w, "i"),
str_detect(w, "o"),
str_detect(w, "u"),
str_detect(w, "y")) %>%
distinct(words)
all_vowels_y
## # A tibble: 6 × 1
## words
## <chr>
## 1 Byelorussian
## 2 Fully-fashioned
## 3 Immunotherapy
## 4 Praseodymium
## 5 Revolutionary
## 6 Uncomplimentary
BankChurners.csvbank <- read_csv("BankChurners.csv")
glimpse(bank)
## Rows: 10,127
## Columns: 23
## $ CLIENTNUM <dbl> …
## $ Attrition_Flag <chr> …
## $ Customer_Age <dbl> …
## $ Gender <chr> …
## $ Dependent_count <dbl> …
## $ Education_Level <chr> …
## $ Marital_Status <chr> …
## $ Income_Category <chr> …
## $ Card_Category <chr> …
## $ Months_on_book <dbl> …
## $ Total_Relationship_Count <dbl> …
## $ Months_Inactive_12_mon <dbl> …
## $ Contacts_Count_12_mon <dbl> …
## $ Credit_Limit <dbl> …
## $ Total_Revolving_Bal <dbl> …
## $ Avg_Open_To_Buy <dbl> …
## $ Total_Amt_Chng_Q4_Q1 <dbl> …
## $ Total_Trans_Amt <dbl> …
## $ Total_Trans_Ct <dbl> …
## $ Total_Ct_Chng_Q4_Q1 <dbl> …
## $ Avg_Utilization_Ratio <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 <dbl> …
These columns hold a small set of unordered categories:
Existing Customer /
Attrited CustomerM / FMarried,
Single, Divorced, UnknownThese columns have a meaningful ordering:
Uneducated <
High School < College <
Graduate < Post-Graduate <
Doctorate (plus Unknown)Less than $40K <
$40K - $60K < $60K - $80K <
$80K - $120K < $120K + (plus
Unknown)Blue <
Silver < Gold <
Platinumbank <- bank %>%
mutate(
Attrition_Flag = factor(Attrition_Flag),
Gender = factor(Gender),
Marital_Status = factor(Marital_Status),
Education_Level = factor(Education_Level,
levels = c("Uneducated", "High School", "College",
"Graduate", "Post-Graduate",
"Doctorate", "Unknown"),
ordered = TRUE),
Income_Category = factor(Income_Category,
levels = c("Less than $40K", "$40K - $60K",
"$60K - $80K", "$80K - $120K",
"$120K +", "Unknown"),
ordered = TRUE),
Card_Category = factor(Card_Category,
levels = c("Blue", "Silver", "Gold", "Platinum"),
ordered = TRUE)
)
bank %>% select(Education_Level, Income_Category, Card_Category) %>% summary()
## Education_Level Income_Category Card_Category
## Uneducated :1487 Less than $40K:3561 Blue :9436
## High School :2013 $40K - $60K :1790 Silver : 555
## College :1013 $60K - $80K :1402 Gold : 116
## Graduate :3128 $80K - $120K :1535 Platinum: 20
## Post-Graduate: 516 $120K + : 727
## Doctorate : 451 Unknown :1112
## Unknown :1519
bank_edu <- bank %>%
filter(Education_Level != "Unknown") %>%
group_by(Education_Level) %>%
summarise(mean_util = mean(Avg_Utilization_Ratio, na.rm = TRUE),
se = sd(Avg_Utilization_Ratio, na.rm = TRUE) / sqrt(n()),
.groups = "drop")
ggplot(bank_edu, aes(Education_Level, mean_util, fill = Education_Level)) +
geom_col(show.legend = FALSE) +
geom_errorbar(aes(ymin = mean_util - se, ymax = mean_util + se),
width = 0.2) +
labs(title = "Average utilization ratio by education level",
x = "Education level",
y = "Mean Avg_Utilization_Ratio") +
theme_minimal()
ggplot(filter(bank, Education_Level != "Unknown"),
aes(Education_Level, Avg_Utilization_Ratio, fill = Education_Level)) +
geom_boxplot(show.legend = FALSE) +
labs(title = "Distribution of utilization ratio by education level",
x = "Education level", y = "Avg_Utilization_Ratio") +
theme_minimal()
The mean utilization ratio is roughly similar (~0.27–0.28) across education levels, suggesting education has only a small effect on how heavily customers use their available credit.
gss_cat data setgss_cat
## # A tibble: 21,483 × 9
## year marital age race rincome partyid relig denom tvhours
## <int> <fct> <int> <fct> <fct> <fct> <fct> <fct> <int>
## 1 2000 Never married 26 White $8000 to 9999 Ind,near … Prot… Sout… 12
## 2 2000 Divorced 48 White $8000 to 9999 Not str r… Prot… Bapt… NA
## 3 2000 Widowed 67 White Not applicable Independe… Prot… No d… 2
## 4 2000 Never married 39 White Not applicable Ind,near … Orth… Not … 4
## 5 2000 Divorced 25 White Not applicable Not str d… None Not … 1
## 6 2000 Married 25 White $20000 - 24999 Strong de… Prot… Sout… NA
## 7 2000 Never married 36 White $25000 or more Not str r… Chri… Not … 3
## 8 2000 Divorced 44 White $7000 to 7999 Ind,near … Prot… Luth… NA
## 9 2000 Married 44 White $25000 or more Not str d… Prot… Other 0
## 10 2000 Married 47 White $25000 or more Strong re… Prot… Sout… 3
## # ℹ 21,473 more rows
maritallevels(gss_cat$marital)
## [1] "No answer" "Never married" "Separated" "Divorced"
## [5] "Widowed" "Married"
gss2 <- gss_cat %>%
mutate(marital = fct_collapse(marital,
"Once Married" = c("Separated",
"Divorced",
"Widowed")))
levels(gss2$marital)
## [1] "No answer" "Never married" "Once Married" "Married"
gss2 %>% count(marital)
## # A tibble: 4 × 2
## marital n
## <fct> <int>
## 1 No answer 17
## 2 Never married 5416
## 3 Once Married 5933
## 4 Married 10117
tvhours?gss2 %>%
filter(!is.na(tvhours), marital != "No answer") %>%
group_by(marital) %>%
summarise(mean_tv = mean(tvhours), n = n(), .groups = "drop") %>%
ggplot(aes(fct_reorder(marital, mean_tv), mean_tv, fill = marital)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = round(mean_tv, 2)), vjust = -0.3) +
labs(title = "Average daily TV hours by (collapsed) marital status",
x = "Marital status", y = "Mean tvhours") +
theme_minimal()
gss2 %>%
filter(!is.na(tvhours), marital != "No answer") %>%
ggplot(aes(marital, tvhours, fill = marital)) +
geom_boxplot(show.legend = FALSE) +
labs(title = "TV hours distribution by marital status",
x = "Marital status", y = "tvhours") +
theme_minimal()
gss_anova <- gss2 %>% filter(!is.na(tvhours), marital != "No answer")
summary(aov(tvhours ~ marital, data = gss_anova))
## Df Sum Sq Mean Sq F value Pr(>F)
## marital 2 1179 589.3 89.36 <2e-16 ***
## Residuals 11325 74685 6.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The “Once Married” and “Never married” groups watch noticeably more TV on average than “Married” respondents; the ANOVA p-value is essentially zero, so the effect is statistically significant.
nycflights13library(nycflights13)
head(flights, 3)
## # A tibble: 3 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
head(airports, 3)
## # A tibble: 3 × 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New…
## 2 06A Moton Field Municipal Airport 32.5 -85.7 264 -6 A America/Chi…
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chi…
dest_tz <- flights %>%
distinct(dest) %>%
left_join(airports, by = c("dest" = "faa")) %>%
filter(!is.na(tzone)) %>%
distinct(tzone)
dest_tz
## # A tibble: 7 × 1
## tzone
## <chr>
## 1 America/Chicago
## 2 America/New_York
## 3 America/Los_Angeles
## 4 America/Phoenix
## 5 America/Denver
## 6 Pacific/Honolulu
## 7 America/Anchorage
nrow(dest_tz)
## [1] 7
nyc_tz <- airports %>%
filter(faa %in% c("JFK", "LGA", "EWR")) %>%
summarise(tz = unique(tz)) %>% pull(tz)
nyc_tz
## [1] -5
target <- tibble(
city = c("Chicago", "Dallas", "Denver", "Seattle", "Anchorage", "Honolulu"),
faa = c("ORD", "DFW", "DEN", "SEA", "ANC", "HNL")
)
target %>%
left_join(airports, by = "faa") %>%
mutate(time_diff_from_NYC = tz - nyc_tz) %>%
select(city, faa, name, tz, time_diff_from_NYC)
## # A tibble: 6 × 5
## city faa name tz time_diff_from_NYC
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Chicago ORD Chicago Ohare Intl -6 -1
## 2 Dallas DFW Dallas Fort Worth Intl -6 -1
## 3 Denver DEN Denver Intl -7 -2
## 4 Seattle SEA Seattle Tacoma Intl -8 -3
## 5 Anchorage ANC Ted Stevens Anchorage Intl -9 -4
## 6 Honolulu HNL Honolulu Intl -10 -5
A negative value means the destination is behind New York City.
Time_difference_NYC(dest)Time_difference_NYC <- function(dest) {
nyc_tz <- airports %>% filter(faa == "JFK") %>% pull(tz)
dest_tz <- airports %>% filter(faa == dest) %>% pull(tz)
if (length(dest_tz) == 0) {
warning("FAA code '", dest, "' not found in airports.")
return(NA_real_)
}
dest_tz - nyc_tz
}
# Tests
Time_difference_NYC("LAX") # Los Angeles
## [1] -3
Time_difference_NYC("ORD") # Chicago
## [1] -1
Time_difference_NYC("HNL") # Honolulu
## [1] -5
Time_difference_NYC("ANC") # Anchorage
## [1] -4
flight_time(dep_time, arr_time, origin, dest)flight_time <- function(dep_time, arr_time, origin, dest) {
hhmm_to_hours <- function(x) (x %/% 100) + (x %% 100) / 60
dep_local <- hhmm_to_hours(dep_time)
arr_local <- hhmm_to_hours(arr_time)
origin_tz <- airports %>% filter(faa == origin) %>% pull(tz)
dest_tz <- airports %>% filter(faa == dest) %>% pull(tz)
if (length(origin_tz) == 0 || length(dest_tz) == 0) {
warning("Unknown FAA code for origin or destination.")
return(NA_real_)
}
dep_utc <- dep_local - origin_tz
arr_utc <- arr_local - dest_tz
if (arr_utc < dep_utc) arr_utc <- arr_utc + 24
arr_utc - dep_utc
}
flights %>%
filter(!is.na(dep_time), !is.na(arr_time)) %>%
slice(1:5) %>%
rowwise() %>%
mutate(computed_hours = flight_time(dep_time, arr_time, origin, dest),
air_time_hrs = air_time / 60) %>%
select(origin, dest, dep_time, arr_time,
computed_hours, air_time_hrs)
## # A tibble: 5 × 6
## # Rowwise:
## origin dest dep_time arr_time computed_hours air_time_hrs
## <chr> <chr> <int> <int> <dbl> <dbl>
## 1 EWR IAH 517 830 4.22 3.78
## 2 LGA IAH 533 850 4.28 3.78
## 3 JFK MIA 542 923 3.68 2.67
## 4 JFK BQN 544 1004 NA 3.05
## 5 LGA ATL 554 812 2.3 1.93
The computed elapsed time should be slightly larger than
air_time (which is in-flight only and excludes taxi time),
confirming the function works.