summary(adult[, c("age","hours.per.week")])
##       age        hours.per.week 
##  Min.   :17.00   Min.   : 1.00  
##  1st Qu.:28.00   1st Qu.:40.00  
##  Median :37.00   Median :40.00  
##  Mean   :38.58   Mean   :40.44  
##  3rd Qu.:48.00   3rd Qu.:45.00  
##  Max.   :90.00   Max.   :99.00  
Answer (1a):
The mean age (≈38.6) sits just above the median (37), so the
age distribution is roughly symmetric with a mild right skew
and an upper tail reaching 90.
Hours/week is tightly centered at 40 (Q1 = median = 40,
mean ≈ 40.4, Q3 = 45), indicating a large spike at exactly
40 hours plus a right tail of rare very long weeks (up to 99).
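A quick optional check of the spike at 40 (not part of the
required output; uses the same adult data frame):
# Share of respondents reporting exactly 40 hours per week
mean(adult$hours.per.week == 40, na.rm = TRUE)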
d_age   <- density(adult$age, na.rm=TRUE)
d_hours <- density(adult$hours.per.week, na.rm=TRUE)
plot(d_age, main="", xlab="Value", ylim=range(d_age$y, d_hours$y))  # ylim covers both curves
lines(d_hours, col="red")
legend("topright", c("age","hours/week"), lty=1, col=c("black","red"))
Densities: age (black) vs hours/week (red)
boxplot(adult$age, adult$hours.per.week,
names=c("age","hours/week"),
main="Boxplots: Age vs Hours/Week")
Boxplots: Age vs Hours/Week
Answer (1b):
Hours/week is tightly concentrated at 40 with many high-end outliers;
age is more broadly spread. This matches the quartiles in 1(a).
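To count how many points the hours/week boxplot flags, one
option is boxplot.stats(), which applies the same 1.5*IQR
rule as boxplot() (an optional sketch):
# Observations beyond the whiskers of the hours/week boxplot
out_hours <- boxplot.stats(adult$hours.per.week)$out
length(out_hours); range(out_hours)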
num_cols <- intersect(c("age","education.num","capital.gain","capital.loss","hours.per.week"),
                      names(adult))
if (length(num_cols) >= 2) {
  pairs(adult[, num_cols], main="Scatterplot Matrix (selected numeric variables)")
} else {
  cat("Selected numeric columns not all present in this file.\n")
}
Answer (1c):
Capital gain/loss are mostly zeros with a few extreme
values (heavy skew). Hours/week piles at 40. A loose
positive relation appears between age and
education.num—patterns that are hard to see from 1-D
plots alone.
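The claimed relation and the zero-inflation can also be
checked numerically (an optional sketch using columns already
referenced above):
# Correlation between age and education.num, using complete pairs
cor(adult$age, adult$education.num, use = "complete.obs")
# Proportion of exact zeros in the capital variables
mean(adult$capital.gain == 0, na.rm = TRUE)
mean(adult$capital.loss == 0, na.rm = TRUE)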
tbl_workclass <- sort(table(adult$workclass), decreasing=TRUE); tbl_workclass
##
## Private Self-emp-not-inc Local-gov State-gov
## 22696 2541 2093 1298
## Self-emp-inc Federal-gov Without-pay Never-worked
## 1116 960 14 7
barplot(tbl_workclass, las=2, main="workclass")
tbl_edu <- sort(table(adult$education), decreasing=TRUE); tbl_edu
##
## HS-grad Some-college Bachelors Masters Assoc-voc 11th
## 10501 7291 5355 1723 1382 1175
## Assoc-acdm 10th 7th-8th Prof-school 9th 12th
## 1067 933 646 576 514 433
## Doctorate 5th-6th 1st-4th Preschool
## 413 333 168 51
barplot(tbl_edu, las=2, main="education")
tbl_sex <- table(adult$sex); tbl_sex
##
## Female Male
## 10771 21790
barplot(tbl_sex, main="sex")
Counts by workclass, education, and sex
Answer (1d):
Clear imbalances exist: “Private” dominates workclass;
HS-grad, Some-college, and Bachelors are by far the largest
education groups; males outnumber females roughly 2:1
(21,790 vs 10,771).
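Expressing the same tables as percentages makes the imbalance
easier to state (optional; uses the tables computed above):
round(prop.table(tbl_workclass) * 100, 1)  # workclass shares (%)
round(prop.table(tbl_sex) * 100, 1)        # sex shares (%)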
tab <- table(adult$sex, adult$income, dnn=c("sex","income"))
tab
## income
## sex <=50K >50K
## Female 9592 1179
## Male 15128 6662
round(prop.table(tab, 1)*100, 1)
## income
## sex <=50K >50K
## Female 89.1 10.9
## Male 69.4 30.6
barplot(tab, beside=TRUE, legend=TRUE,
main="Income by Sex", xlab="Sex", ylab="Count")
mosaicplot(tab, main="Mosaic: Income by Sex", color=TRUE)
Income by Sex: counts and mosaic
Answer (1e):
Men earn >50K at a much higher rate (≈31% vs ≈11% for
women), while women are concentrated in the ≤50K group.
The mosaic plot mirrors these row percentages.
# Standardize column names, then split them into year columns vs ID columns
even <- population_even; odd <- population_odd
names(even) <- toupper(names(even)); names(odd) <- toupper(names(odd))
year_pat <- "20(1[0-9])"   # matches 2010-2019 embedded in a column name
year_cols_even <- grep(year_pat, names(even), value=TRUE)
year_cols_odd  <- grep(year_pat, names(odd),  value=TRUE)
id_cols_even <- setdiff(names(even), year_cols_even)
id_cols_odd  <- setdiff(names(odd),  year_cols_odd)
common_id_cols <- intersect(id_cols_even, id_cols_odd)
# Reshape one file to long form: (ID columns, year, population)
to_long <- function(df, id_cols, year_cols) {
  df %>%
    pivot_longer(all_of(year_cols), names_to="year_raw", values_to="population") %>%
    mutate(year = sub(".*(20(1[0-9])).*", "\\1", year_raw)) %>%
    select(all_of(id_cols), year, population)
}
even_long <- to_long(even, common_id_cols, year_cols_even)
odd_long <- to_long(odd, common_id_cols, year_cols_odd)
pop_long <- bind_rows(even_long, odd_long) %>% filter(year >= "2010", year <= "2019")
pop_wide <- pop_long %>% pivot_wider(names_from=year, values_from=population) %>%
arrange(across(all_of(common_id_cols)))
head(pop_wide)
## # A tibble: 6 × 12
## STATE NAME `2010` `2012` `2014` `2016` `2018` `2011` `2013` `2015` `2017`
## <int> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 1 Alabama 4.79e6 4.82e6 4.84e6 4.86e6 4.89e6 4.80e6 4.83e6 4.85e6 4.87e6
## 2 2 Alaska 7.14e5 7.30e5 7.36e5 7.41e5 7.35e5 7.22e5 7.37e5 7.37e5 7.40e5
## 3 4 Arizona 6.41e6 6.55e6 6.73e6 6.94e6 7.16e6 NA 6.63e6 6.83e6 7.04e6
## 4 5 Arkansas 2.92e6 2.95e6 2.97e6 2.99e6 3.01e6 2.94e6 2.96e6 2.98e6 3.00e6
## 5 6 Califor… 3.73e7 3.79e7 3.86e7 3.92e7 3.95e7 3.76e7 3.83e7 3.89e7 3.94e7
## 6 8 Colorado 5.05e6 5.19e6 5.35e6 5.54e6 5.69e6 5.12e6 5.27e6 5.45e6 5.61e6
## # ℹ 1 more variable: `2019` <int>
Answer (2a):
Reshaped both files to long, combined, filtered to
2010–2019, then widened so each state is one row with
year columns.
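A couple of quick sanity checks on the reshaped table (an
optional sketch; assumes NAME uniquely identifies each state,
as the head() output suggests):
stopifnot(nrow(pop_wide) == dplyr::n_distinct(pop_wide$NAME))  # one row per state
setdiff(as.character(2010:2019), names(pop_wide))              # expect character(0)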
names(pop_wide) <- ifelse(str_detect(names(pop_wide), year_pat),
str_extract(names(pop_wide), year_pat),
names(pop_wide))
years <- intersect(as.character(2010:2019), names(pop_wide))
id_now <- setdiff(names(pop_wide), years)
pop_wide <- pop_wide %>% select(all_of(id_now), all_of(years))
names(pop_wide)
## [1] "STATE" "NAME" "2010" "2011" "2012" "2013" "2014" "2015" "2016"
## [10] "2017" "2018" "2019"
head(pop_wide)
## # A tibble: 6 × 12
## STATE NAME `2010` `2011` `2012` `2013` `2014` `2015` `2016` `2017` `2018`
## <int> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 1 Alabama 4.79e6 4.80e6 4.82e6 4.83e6 4.84e6 4.85e6 4.86e6 4.87e6 4.89e6
## 2 2 Alaska 7.14e5 7.22e5 7.30e5 7.37e5 7.36e5 7.37e5 7.41e5 7.40e5 7.35e5
## 3 4 Arizona 6.41e6 NA 6.55e6 6.63e6 6.73e6 6.83e6 6.94e6 7.04e6 7.16e6
## 4 5 Arkansas 2.92e6 2.94e6 2.95e6 2.96e6 2.97e6 2.98e6 2.99e6 3.00e6 3.01e6
## 5 6 Califor… 3.73e7 3.76e7 3.79e7 3.83e7 3.86e7 3.89e7 3.92e7 3.94e7 3.95e7
## 6 8 Colorado 5.05e6 5.12e6 5.19e6 5.27e6 5.35e6 5.45e6 5.54e6 5.61e6 5.69e6
## # ℹ 1 more variable: `2019` <int>
Answer (2b):
Year columns are cleaned to YYYY and ordered
2010→2019, with IDs first.
# Strip any thousands separators, then fill single interior gaps with the
# mean of the two neighboring years (2010 and 2019 have no two neighbors)
pop_wide[years] <- lapply(pop_wide[years], function(x) as.numeric(gsub(",", "", x)))
for (yr in 2011:2018) {
  prev <- as.character(yr - 1); nxt <- as.character(yr + 1); cur <- as.character(yr)
  needs <- is.na(pop_wide[[cur]]) & !is.na(pop_wide[[prev]]) & !is.na(pop_wide[[nxt]])
  pop_wide[[cur]][needs] <- (pop_wide[[prev]][needs] + pop_wide[[nxt]][needs]) / 2
}
colSums(is.na(pop_wide[years]))
## 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019
## 0 0 0 0 0 0 0 0 0 1
Answer (2c):
Any single missing interior year (e.g., Arizona's 2011
value) is imputed with the mean of its two neighbors (2010 &
2012). Edge years (2010, 2019) and multi-year gaps remain
NA, which is why one 2019 value is still missing.
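To locate exactly which row still has a missing value after
imputation (an optional check using the same objects and
dplyr's if_any()):
# Rows with any remaining NA among the year columns
pop_wide %>% filter(if_any(all_of(years), is.na))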
pop_wide <- pop_wide %>%
  rowwise() %>%
  mutate(
    max_pop_single_year = max(c_across(all_of(years)), na.rm = TRUE),
    max_pop_year = {
      vals <- c_across(all_of(years))
      if (all(is.na(vals))) NA_character_
      else years[which.max(replace(vals, is.na(vals), -Inf))]
    },
    total_pop_all_years = sum(c_across(all_of(years)), na.rm = TRUE)
  ) %>%
  ungroup()
head(pop_wide %>% select(any_of(common_id_cols), max_pop_single_year, max_pop_year, total_pop_all_years))
## # A tibble: 6 × 5
## STATE NAME max_pop_single_year max_pop_year total_pop_all_years
## <int> <chr> <dbl> <chr> <dbl>
## 1 1 Alabama 4903185 2019 48453198
## 2 2 Alaska 741456 2016 7325170
## 3 4 Arizona 7278717 2019 68057899
## 4 5 Arkansas 3017804 2019 29738435
## 5 6 California 39512223 2019 386181900
## 6 8 Colorado 5758736 2019 54031986
Answer (2d):
For each state, computed its single-year maximum population,
the year that maximum occurred, and the total across all ten
years (the same c_across() pattern with sum() in place of max()).
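A base-R cross-check of the rowwise result (optional;
recomputes the maxima with apply() over the same year columns):
stopifnot(all(pop_wide$max_pop_single_year ==
              apply(pop_wide[years], 1, max, na.rm = TRUE)))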
year_of_interest <- "2019"
us_total_single_year <- sum(pop_wide[[year_of_interest]], na.rm=TRUE)
format(us_total_single_year, big.mark = ",")
## [1] "325,610,783"
Answer (2e):
The 2019 US total is the sum across states. One state's 2019
value is still NA (see 2c), so na.rm=TRUE drops it and the
reported total slightly understates the true figure.
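The same sum can be produced for every year at once (an
optional extension using the numeric year columns):
# US total (sum over states) for each year, 2010-2019
colSums(pop_wide[years], na.rm = TRUE)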
pop_long3 <- pop_wide %>%
pivot_longer(cols=all_of(years), names_to="year", values_to="population") %>%
mutate(year = as.integer(year))
states_to_plot <- c("Illinois","California","Texas")
state_col <- if ("NAME" %in% names(pop_wide)) "NAME" else common_id_cols[1]
plot_df <- pop_long3 %>% filter(.data[[state_col]] %in% states_to_plot)
ggplot(plot_df, aes(year, population, color = .data[[state_col]],
                    group = .data[[state_col]])) +
  geom_line(linewidth=1) + geom_point() +
  scale_x_continuous(breaks=2010:2019) +
  scale_y_continuous(labels = scales::comma) +
labs(title="State Population Over Time (2010–2019)",
x="Year", y="Population", color="State") +
theme_minimal(base_size=12)
Answer (3):
Reshaped to long (year, population), converted
year to integer, filtered to three states, and plotted
lines by state to compare levels and trends.
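To complement the plot, each plotted state's 2010→2019 growth
can be tabulated (an optional sketch; assumes one row per
state-year in plot_df, as constructed above):
plot_df %>%
  group_by(.data[[state_col]]) %>%
  summarize(growth_pct = 100 * (population[year == 2019] / population[year == 2010] - 1))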
4(a) Two kinds of dirty data & fixes
• Missing values → impute (median/mean;
forward/backfill for time series) or drop if too extensive.
• Inconsistent categories/formatting → trim/lowercase,
map to canonical labels; validate against allowed lists.
(Others: duplicates → de-dup on keys; unit mismatches → convert;
obvious outliers → investigate or winsorize.) A short R
sketch of the first two fixes follows below.
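A minimal sketch of the first two fixes, on a small
hypothetical data frame (column names are illustrative only):
# Hypothetical dirty data: a missing age and inconsistently coded cities
df <- data.frame(age  = c(34, NA, 51, 28),
                 city = c(" Chicago", "chicago ", "NYC", "New York"))
# Missing values: impute the numeric column with its median
df$age[is.na(df$age)] <- median(df$age, na.rm = TRUE)
# Inconsistent categories: trim/lowercase, then map to canonical labels
df$city <- trimws(tolower(df$city))
df$city[df$city %in% c("nyc", "new york")] <- "new york"
df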
4(b) Map questions to data-mining
functionality
• “Five groups of customers who buy similar things?” →
Clustering/segmentation (e.g., k-means,
hierarchical).
• “Predict if a customer will buy milk based on what else they bought?”
→ Classification (e.g., logistic regression, trees,
RF/boosting).
• “What sets of products are often purchased together?” →
Association rules (Apriori/FP-growth; report support,
confidence, lift). Illustrative R calls for all three tasks
are sketched below.
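These are sketch-level calls on a simulated 0/1 basket matrix
(names and parameters are hypothetical, not from the
assignment data):
set.seed(1)
baskets <- matrix(rbinom(500 * 6, 1, 0.3), nrow = 500,
                  dimnames = list(NULL, c("milk","bread","eggs","beer","chips","salsa")))
# Clustering: five customer segments from purchase profiles
seg <- kmeans(baskets, centers = 5)
# Classification: predict milk purchases from the other items
fit <- glm(milk ~ ., data = as.data.frame(baskets), family = binomial)
# Association rules (requires the arules package):
# library(arules)
# rules <- apriori(as(baskets == 1, "transactions"),
#                  parameter = list(supp = 0.05, conf = 0.5))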
4(c) Is it data mining?
• Organizing customers by education level → No (simple
grouping).
• Computing total sales → No (basic
aggregation/OLAP).
• Sorting by student ID → No (data management).
• Predicting outcomes of tossing a fair pair of dice →
No (known random process).
• Predicting a company’s future stock price from history →
Yes (predictive/time-series modeling; hard/noisy, but a
DM task).