library(dplyr)
library(readr)
library(ggplot2)
setwd("~/Desktop/datasets")
UkSmoking <- read_csv("11263-Smoking_tcm86-13253.csv")Project 2
Project Overview
I will be analyzing smoking data in the UK. The dataset I’m using is from the UK STEM Foundation, and contains a survey of 1,693 total observations, 12 columns/variables (9 categorical, 3 numerical). Each variable is defined below:
“Smoke?”: categorical, binary (yes/no). Categorizes people who do/don’t smoke
“Highest Qualification”: categorical, ordinal. Different education levels of each observation
“Age”: numerical, discrete. The age of each observation
“Marital Status”: categorical, nominal. The marital status of each observation
“Sex”: categorical, binary (male/female). The sex of each observation
“Region”: categorical, nominal. The region each observation lives in
“Nationality”: categorical, nominal. The nationality of each observation
“Ethnicity”: categorical, nominal. The ethnicity of each observation
“Gross Income”: categorical, ordinal. The gross income of each observation by ranges
Research Question
“To what extent are education level, smoking, and marital status associated with age?” I chose this topic and dataset because I am interested in how smoking is categorized by different lifestyles and stages of life. I will be trying to find the correlation between different variables in this dataset, focusing on using age (dependent variable), against smoking status, education level, and marital status.
Cleaning
To clean the dataset, I removed the last 3 rows (Type, Amount Weekdays, Amount Weekends), as they had mostly NAs. Afterwards, I removed the value “99” under the Highest Qualification column, as it only had 1 observation and therefore was an outlier.
Load libraries and dataset
Exploration
#check the structure of the dataset and variable types
str(UkSmoking)spc_tbl_ [1,693 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ Sex : chr [1:1693] "Male" "Female" "Male" "Female" ...
$ Age : num [1:1693] 38 42 40 40 39 37 53 44 40 41 ...
$ Marital Status : chr [1:1693] "Divorced" "Single" "Married" "Married" ...
$ Highest Qualification: chr [1:1693] "No Qualification" "No Qualification" "Degree" "Degree" ...
$ Nationality : chr [1:1693] "British" "British" "English" "English" ...
$ Ethnicity : chr [1:1693] "White" "White" "White" "White" ...
$ Gross Income : chr [1:1693] "£2600 to less than £5200" "Less than £2600" "£28600 to less than £36400" "£10400 to less than £15600" ...
$ Region : chr [1:1693] "The North" "The North" "The North" "The North" ...
$ Smoke? : chr [1:1693] "No" "Yes" "No" "No" ...
$ Amount Weekends : chr [1:1693] "N/A" "12" "N/A" "N/A" ...
$ Amount Weekdays : chr [1:1693] "N/A" "12" "N/A" "N/A" ...
$ Type : chr [1:1693] "N/A" "Packets" "N/A" "N/A" ...
- attr(*, "spec")=
.. cols(
.. Sex = col_character(),
.. Age = col_double(),
.. `Marital Status` = col_character(),
.. `Highest Qualification` = col_character(),
.. Nationality = col_character(),
.. Ethnicity = col_character(),
.. `Gross Income` = col_character(),
.. Region = col_character(),
.. `Smoke?` = col_character(),
.. `Amount Weekends` = col_character(),
.. `Amount Weekdays` = col_character(),
.. Type = col_character()
.. )
- attr(*, "problems")=<externalptr>
#summary of the dataset- verifying number of observations
summary(UkSmoking) Sex Age Marital Status Highest Qualification
Length:1693 Min. :16.00 Length:1693 Length:1693
Class :character 1st Qu.:34.00 Class :character Class :character
Mode :character Median :48.00 Mode :character Mode :character
Mean :49.82
3rd Qu.:65.00
Max. :97.00
Nationality Ethnicity Gross Income Region
Length:1693 Length:1693 Length:1693 Length:1693
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
Smoke? Amount Weekends Amount Weekdays Type
Length:1693 Length:1693 Length:1693 Length:1693
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
UkSmoking |>
group_by(`Highest Qualification`) |>
count(`Smoke?`)# A tibble: 17 × 3
# Groups: Highest Qualification [9]
`Highest Qualification` `Smoke?` n
<chr> <chr> <int>
1 99 Yes 1
2 A Levels No 84
3 A Levels Yes 21
4 Degree No 223
5 Degree Yes 39
6 GCSE/CSE No 64
7 GCSE/CSE Yes 39
8 GCSE/O Level No 203
9 GCSE/O Level Yes 105
10 Higher/Sub Degree No 98
11 Higher/Sub Degree Yes 27
12 No Qualification No 449
13 No Qualification Yes 137
14 ONC/BTEC No 53
15 ONC/BTEC Yes 23
16 Other/Sub Degree No 96
17 Other/Sub Degree Yes 31
Visuals
#smokers based on education levels:
Smoke_qualification <- UkSmoking |>
#filter to smokers only
filter(`Smoke?` == "Yes") |>
#count the amount of smokers in each qualification category
count(`Highest Qualification`)
#bar chart of smokers based on education levels
ggplot(Smoke_qualification, aes(x = `Highest Qualification`, y = n, fill = `Highest Qualification`)) +
geom_col() +
theme_minimal() +
labs(
title = "Smokers vs their Highest Qualification level",
caption = "Source: UK STEM Foundation",
x = "Qualification Type", y = "Amount of Smokers") +
theme(axis.text.x = element_blank())#mean age of smokers v.s nonsmokers
Smoking_age <- UkSmoking |>
group_by(`Smoke?`) |>
summarise(mean_age = mean(Age, na.rm = TRUE))
#bar chart of the mean age of smokers v.s nonsmokers
ggplot(Smoking_age, aes(x = `Smoke?`, y = mean_age, fill = `Smoke?`)) +
geom_col() +
theme_minimal() +
labs(
title = "Smoker and Non Smoker Mean Age",
caption = "Source: UK STEM Foundation",
x = "Smoke? (yes/no)", y = "Mean Age")#mean age across marital status
Age_Marriage <- UkSmoking |>
group_by(`Marital Status`) |>
summarise(mean_age = mean(Age, na.rm = TRUE))
#bar chart of the mean age across marital status
ggplot(Age_Marriage, aes(x = `Marital Status`, y = mean_age, fill = `Marital Status`)) +
geom_col() +
theme_minimal() +
labs(
title = "Mean Age vs Marital Status",
caption = "Source: UK STEM Foundation",
x = "Marital Status", y = "Mean Age")Cleaning
#remove last 3 columns; obvious, many NAs
UkSmoking <- UkSmoking|>
select(-Type, -`Amount Weekdays`, -`Amount Weekends`) |>
#remove qualification level "99". Only 1 value, it is an outlier
filter(`Highest Qualification` != "99")
sum(is.na(UkSmoking))[1] 0
#no more NAs- cleaning is doneMultiple Linear Regression
I’m choosing to use education level, smoking status, and marital status because they are closely linked to varying age groups/life stages. I’m using backwards elimination so I can analyze the individual significance of each variable.
#fit multiple linear regression: age ~ highest qualification + smoking + marital status
fit1 <- lm(Age ~ `Smoke?` + `Highest Qualification` + `Marital Status`, data = UkSmoking)
#view the results
summary(fit1)
Call:
lm(formula = Age ~ `Smoke?` + `Highest Qualification` + `Marital Status`,
data = UkSmoking)
Residuals:
Min 1Q Median 3Q Max
-40.870 -9.133 -0.812 8.749 45.341
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 45.59611 1.63772 27.841 < 2e-16
`Smoke?`Yes -5.67916 0.75931 -7.479 1.2e-13
`Highest Qualification`Degree 3.18828 1.51099 2.110 0.035001
`Highest Qualification`GCSE/CSE 0.67273 1.82013 0.370 0.711723
`Highest Qualification`GCSE/O Level 1.82579 1.48335 1.231 0.218548
`Highest Qualification`Higher/Sub Degree 6.68877 1.73249 3.861 0.000117
`Highest Qualification`No Qualification 15.56806 1.41561 10.997 < 2e-16
`Highest Qualification`ONC/BTEC 0.08643 1.97173 0.044 0.965043
`Highest Qualification`Other/Sub Degree 11.86545 1.73454 6.841 1.1e-11
`Marital Status`Married -0.53349 1.14035 -0.468 0.639969
`Marital Status`Separated -5.27405 1.88408 -2.799 0.005180
`Marital Status`Single -14.60949 1.22043 -11.971 < 2e-16
`Marital Status`Widowed 16.38522 1.38102 11.865 < 2e-16
(Intercept) ***
`Smoke?`Yes ***
`Highest Qualification`Degree *
`Highest Qualification`GCSE/CSE
`Highest Qualification`GCSE/O Level
`Highest Qualification`Higher/Sub Degree ***
`Highest Qualification`No Qualification ***
`Highest Qualification`ONC/BTEC
`Highest Qualification`Other/Sub Degree ***
`Marital Status`Married
`Marital Status`Separated **
`Marital Status`Single ***
`Marital Status`Widowed ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 13.07 on 1679 degrees of freedom
Multiple R-squared: 0.5165, Adjusted R-squared: 0.513
F-statistic: 149.5 on 12 and 1679 DF, p-value: < 2.2e-16
” Highest QualificationONC/BTEC ” has the highest pvalue of 0.965. We need to remove this.
#create a new dataset that excludes ONC/BTEC
UkSmoking_filtered <- UkSmoking |>
filter(`Highest Qualification` != "ONC/BTEC")
#run the model on the filtered data
fit2 <- lm(Age ~ `Smoke?` + `Highest Qualification` + `Marital Status`, data = UkSmoking_filtered)
summary(fit2)
Call:
lm(formula = Age ~ `Smoke?` + `Highest Qualification` + `Marital Status`,
data = UkSmoking_filtered)
Residuals:
Min 1Q Median 3Q Max
-40.690 -9.005 -0.723 8.730 45.271
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 45.4173 1.6504 27.519 < 2e-16
`Smoke?`Yes -5.8827 0.7814 -7.529 8.51e-14
`Highest Qualification`Degree 3.1699 1.5150 2.092 0.036571
`Highest Qualification`GCSE/CSE 0.6949 1.8253 0.381 0.703459
`Highest Qualification`GCSE/O Level 1.8533 1.4875 1.246 0.212994
`Highest Qualification`Higher/Sub Degree 6.6819 1.7371 3.846 0.000125
`Highest Qualification`No Qualification 15.5796 1.4200 10.972 < 2e-16
`Highest Qualification`Other/Sub Degree 11.8882 1.7395 6.834 1.17e-11
`Marital Status`Married -0.3175 1.1589 -0.274 0.784133
`Marital Status`Separated -4.4068 1.9455 -2.265 0.023639
`Marital Status`Single -14.3831 1.2450 -11.553 < 2e-16
`Marital Status`Widowed 16.5763 1.3969 11.867 < 2e-16
(Intercept) ***
`Smoke?`Yes ***
`Highest Qualification`Degree *
`Highest Qualification`GCSE/CSE
`Highest Qualification`GCSE/O Level
`Highest Qualification`Higher/Sub Degree ***
`Highest Qualification`No Qualification ***
`Highest Qualification`Other/Sub Degree ***
`Marital Status`Married
`Marital Status`Separated *
`Marital Status`Single ***
`Marital Status`Widowed ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 13.11 on 1604 degrees of freedom
Multiple R-squared: 0.5145, Adjusted R-squared: 0.5111
F-statistic: 154.5 on 11 and 1604 DF, p-value: < 2.2e-16
Regression conclusion
final equation:
significant pvalues:
Smoke? Yes (\(p = 8.51e-14\))
Highest Qualification Higher/Sub Degree (\(p = 0.000125\))
Highest Qualification No Qualification (\(p < 2e-16\))
Highest Qualification Other/Sub Degree (\(p = 1.17e-11\))
Marital Status Single (\(p < 2e-16\))
Marital Status Widowed (\(p < 2e-16\))
adjusted R^2: 0.5111
Written conclusion: The adjusted R^2 shows that the model explains approximately 51.11% of the variance in age. The results indicate that being single or a smoker is associated with a lower average age, and having no formal qualifications or being widowed are the strongest predictors of a higher average age.
Final Visualization:
Create ShinyApp Histogram for Age, optional filtering for smoking status
library(shiny)
#UI/page details
ui <- fluidPage(
titlePanel("Distribution of Age"),
sidebarLayout(
sidebarPanel(
sliderInput(inputId = "bins",
label = "Number of bins:",
min = 1,
max = 30,
value = 15),
#adding a checkbox for smoking status
checkboxInput(inputId = "onlySmokers",
label = "Filter for Smokers Only",
value = FALSE)
),
mainPanel(
plotOutput(outputId = "distPlot")
)
)
)
server <- function(input, output) {
output$distPlot <- renderPlot({
#filter based on checkbox. If the box is checked, filter the plot data to smokers. If else, set the plot data to the default (UkSmoking)
if (input$onlySmokers) {
plot_data <- UkSmoking[UkSmoking$'Smoke?' == "Yes", ]
} else {
plot_data <- UkSmoking
}
#select age
x <- plot_data$Age
bins <- seq(min(x), max(x), length.out = input$bins + 1)
#create histogram
hist(x, breaks = bins, col = "#FFB6C1", border = "white",
xlab = "Age (Years)",
main = "Histogram of Ages",
#add a subtitle of the sample size based on the checkbox filter
sub = paste("Sample Size: n =", length(x)))
#vertical line for mean age
abline(v = mean(x), col = "red", lwd = 2, lty = 2)
legend("topright", legend = paste("Mean Age:", round(mean(x), 1)),
col = "red", lwd = 2, lty = 2)
})
}
shinyApp(ui = ui, server = server)Sources:
Chalabi, Mona. “Unemployed and Single? Who Are Britain’s Smokers?” The Guardian, The Guardian, 26 Sept. 2013, www.theguardian.com/news/datablog/2013/sep/26/unemployed-single-britain-smokers-uk-cigarette-statistics. Accessed 22 Apr. 2026.