Load Dataset
# Read the raw survey data from the CSV file in the working directory.
df <- read_csv(file = "dataset.csv")
## Rows: 3000 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): GENDER, LUNG_CANCER
## dbl (14): AGE, SMOKING, YELLOW_FINGERS, ANXIETY, PEER_PRESSURE, CHRONIC_DISE...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Recode the fifteen categorical columns as factors; AGE is not listed and
# therefore stays numeric.
df <- df %>%
  mutate(
    across(
      .cols = c(
        GENDER, SMOKING, YELLOW_FINGERS, ANXIETY, PEER_PRESSURE,
        CHRONIC_DISEASE, FATIGUE, ALLERGY, WHEEZING, ALCOHOL_CONSUMING,
        COUGHING, SHORTNESS_OF_BREATH, SWALLOWING_DIFFICULTY, CHEST_PAIN,
        LUNG_CANCER
      ),
      .fns = as.factor
    )
  )
Descriptive
Analysis
Distribution of
Age
# Five-number summary plus mean of the AGE column.
summary(df[["AGE"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.00 42.00 55.00 55.17 68.00 80.00
# Histogram of respondent ages in 5-year-wide bins.
ggplot(data = df) +
  geom_histogram(
    mapping = aes(x = AGE),
    binwidth = 5,
    colour = "black",
    fill = "skyblue"
  ) +
  labs(x = "Age", y = "Count", title = "Distribution of Age") +
  theme_minimal()

Gender
Percentage
# Count rows per GENDER level, then express each count as a percentage of
# all rows.
gender_counts <- summarise(group_by(df, GENDER), Count = n())
gender_pct <- mutate(gender_counts, Percentage = (Count / sum(Count)) * 100)
print(gender_pct)
## # A tibble: 2 × 3
## GENDER Count Percentage
## <fct> <int> <dbl>
## 1 F 1486 49.5
## 2 M 1514 50.5
# Pie chart: a single stacked column of percentages wrapped onto polar
# coordinates, one slice per GENDER level.
ggplot(data = gender_pct) +
  geom_col(mapping = aes(x = "", y = Percentage, fill = GENDER)) +
  coord_polar(theta = "y") +
  labs(title = "Gender Distribution") +
  theme_void()

Lung Cancer
Percentage
# Count rows per LUNG_CANCER level, then express each count as a percentage
# of all rows.
cancer_counts <- summarise(group_by(df, LUNG_CANCER), Count = n())
cancer_pct <- mutate(cancer_counts, Percentage = (Count / sum(Count)) * 100)
print(cancer_pct)
## # A tibble: 2 × 3
## LUNG_CANCER Count Percentage
## <fct> <int> <dbl>
## 1 NO 1482 49.4
## 2 YES 1518 50.6
# Pie chart: a single stacked column of percentages wrapped onto polar
# coordinates, one slice per LUNG_CANCER level.
ggplot(data = cancer_pct) +
  geom_col(mapping = aes(x = "", y = Percentage, fill = LUNG_CANCER)) +
  coord_polar(theta = "y") +
  labs(title = "Lung Cancer Distribution") +
  theme_void()

Average Age by Lung
Cancer Status
# Mean AGE within each LUNG_CANCER level.
avg_age <- summarise(
  group_by(df, LUNG_CANCER),
  Average_Age = mean(AGE)
)
print(avg_age)
## # A tibble: 2 × 2
## LUNG_CANCER Average_Age
## <fct> <dbl>
## 1 NO 55.7
## 2 YES 54.6
# Age distribution by lung-cancer status. A boxplot draws the median and
# quartiles only, so the group mean is overlaid as a white diamond to make
# the "Average Age" in the title actually visible on the chart.
ggplot(df, aes(x = LUNG_CANCER, y = AGE, fill = LUNG_CANCER)) +
  geom_boxplot() +
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 23,
    size = 3,
    fill = "white",
    show.legend = FALSE
  ) +
  labs(
    title = "Average Age of Lung Cancer Patients vs Non-patients",
    subtitle = "Diamond marks the group mean",
    x = "Lung cancer",
    y = "Age"
  )

Smokers and Lung
Cancer
# Count lung-cancer outcomes among rows whose SMOKING level is 1.
# NOTE(review): SMOKING has levels 1 and 2; this assumes level 1 means
# "smoker" -- confirm against the dataset codebook (some surveys code
# 1 = no, 2 = yes, in which case this table describes non-smokers).
smoking_level_1 <- filter(df, SMOKING == 1)
smokers <- summarise(group_by(smoking_level_1, LUNG_CANCER), Count = n())
print(smokers)
## # A tibble: 2 × 2
## LUNG_CANCER Count
## <fct> <int>
## 1 NO 765
## 2 YES 762
Correlation & Risk
Factor Analysis
Smoking vs Lung
Cancer
# Share of lung-cancer outcomes within each SMOKING level.
# position = "fill" rescales every bar to height 1, so the y axis is a
# proportion; label it as such instead of the default "count".
ggplot(df, aes(x = SMOKING, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Smoking vs Lung Cancer", y = "Proportion")

Chronic Disease
Impact
# Share of lung-cancer outcomes within each CHRONIC_DISEASE level.
# position = "fill" rescales every bar to height 1, so the y axis is a
# proportion; label it as such instead of the default "count".
ggplot(df, aes(x = CHRONIC_DISEASE, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Chronic Disease vs Lung Cancer", y = "Proportion")

Alcohol Consumption
Impact
# Share of lung-cancer outcomes within each ALCOHOL_CONSUMING level.
# position = "fill" rescales every bar to height 1, so the y axis is a
# proportion; label it as such instead of the default "count".
ggplot(df, aes(x = ALCOHOL_CONSUMING, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Alcohol Consumption vs Lung Cancer", y = "Proportion")

Wheezing
Relation
# Share of lung-cancer outcomes within each WHEEZING level.
# position = "fill" rescales every bar to height 1, so the y axis is a
# proportion; label it as such instead of the default "count".
ggplot(df, aes(x = WHEEZING, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Wheezing vs Lung Cancer", y = "Proportion")

Fatigue & Anxiety
Effects
# Fatigue: share of lung-cancer outcomes within each FATIGUE level.
# position = "fill" rescales every bar to height 1, so the y axis is a
# proportion; label it as such instead of the default "count".
ggplot(df, aes(x = FATIGUE, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Fatigue vs Lung Cancer", y = "Proportion")

# Anxiety: same chart for each ANXIETY level, with the same proportion axis.
ggplot(df, aes(x = ANXIETY, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Anxiety vs Lung Cancer", y = "Proportion")

Chronic Disease &
Chest Pain Combined
# Cross CHRONIC_DISEASE with CHEST_PAIN into one factor whose levels read
# "<chronic disease>.<chest pain>" (interaction() joins levels with ".").
df$COMBINED <- interaction(df$CHRONIC_DISEASE, df$CHEST_PAIN)
# Share of lung-cancer outcomes within each combined level.
# position = "fill" rescales every bar to height 1, so the y axis is a
# proportion; the x label spells out the order of the two crossed factors.
ggplot(df, aes(x = COMBINED, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(
    title = "Chronic Disease & Chest Pain vs Lung Cancer",
    x = "CHRONIC_DISEASE.CHEST_PAIN",
    y = "Proportion"
  )

Comparative
Analysis
Gender vs Lung
Cancer
# Share of lung-cancer outcomes within each GENDER level.
# position = "fill" rescales every bar to height 1, so the y axis is a
# proportion; label it as such instead of the default "count".
ggplot(df, aes(x = GENDER, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Gender vs Lung Cancer", y = "Proportion")

Yellow Fingers vs
Lung Cancer
# Share of lung-cancer outcomes within each YELLOW_FINGERS level.
# position = "fill" rescales every bar to height 1, so the y axis is a
# proportion; label it as such instead of the default "count".
ggplot(df, aes(x = YELLOW_FINGERS, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Yellow Fingers vs Lung Cancer", y = "Proportion")

Smokers vs
Non-Smokers Comparison
# Share of lung-cancer outcomes within each SMOKING level.
# position = "fill" rescales every bar to height 1, so the y axis is a
# proportion; label it as such instead of the default "count".
ggplot(df, aes(x = SMOKING, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Smokers vs Non-Smokers and Lung Cancer", y = "Proportion")

Peer Pressure
Analysis
# Peer Pressure -> Smoking: share of SMOKING levels within each
# PEER_PRESSURE level. position = "fill" rescales every bar to height 1, so
# the y axis is a proportion; label it as such instead of the default "count".
ggplot(df, aes(x = PEER_PRESSURE, fill = SMOKING)) +
  geom_bar(position = "fill") +
  labs(title = "Peer Pressure vs Smoking", y = "Proportion")

# Peer Pressure -> Lung Cancer: same chart with LUNG_CANCER as the fill,
# again on a proportion axis.
ggplot(df, aes(x = PEER_PRESSURE, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Peer Pressure vs Lung Cancer", y = "Proportion")

Predictive
Analysis
Logistic Regression
(Age, Smoking, Chronic Disease)
# Logistic regression of lung-cancer status on age, smoking and chronic
# disease. LUNG_CANCER is a factor built from a character column, so its
# levels are ordered NO, YES; the binomial family treats the first level as
# failure, which makes the coefficients log-odds of YES.
log_model <- glm(
  formula = LUNG_CANCER ~ AGE + SMOKING + CHRONIC_DISEASE,
  family = binomial,
  data = df
)
print(summary(log_model))
##
## Call:
## glm(formula = LUNG_CANCER ~ AGE + SMOKING + CHRONIC_DISEASE,
## family = binomial, data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.288630 0.152130 1.897 0.0578 .
## AGE -0.004860 0.002485 -1.956 0.0505 .
## SMOKING2 0.056227 0.073195 0.768 0.4424
## CHRONIC_DISEASE2 -0.047257 0.073209 -0.646 0.5186
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.5 on 2999 degrees of freedom
## Residual deviance: 4153.7 on 2996 degrees of freedom
## AIC: 4161.7
##
## Number of Fisher Scoring iterations: 3
Linear Regression
(Age vs Lung Cancer)
# Linear probability model: regress a 0/1 lung-cancer indicator on AGE.
# as.numeric() applied directly to a factor returns its internal level codes
# (1 = NO, 2 = YES), which offsets the intercept by one. Comparing against
# "YES" yields a true 0/1 outcome, so fitted values read as the probability
# of lung cancer. Slope, residuals, R-squared and F-statistic are unaffected.
# NOTE(review): LUNG_CANCER_NUM now holds 0/1 instead of 1/2 -- confirm no
# later code relies on the old coding.
df$LUNG_CANCER_NUM <- as.numeric(df$LUNG_CANCER == "YES")
age_model <- lm(LUNG_CANCER_NUM ~ AGE, data = df)
summary(age_model)
##
## Call:
## lm(formula = LUNG_CANCER_NUM ~ AGE, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5365 -0.5050 0.4647 0.4926 0.5241
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5729124 0.0353941 16.187 <2e-16 ***
## AGE -0.0012129 0.0006199 -1.957 0.0505 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4998 on 2998 degrees of freedom
## Multiple R-squared: 0.001275, Adjusted R-squared: 0.0009422
## F-statistic: 3.828 on 1 and 2998 DF, p-value: 0.05048
Final Summary
- Age is slightly negatively associated with lung
cancer (mean age 54.6 with cancer vs 55.7 without; p ≈ 0.05).
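- Smoking shows no significant effect on lung cancer risk in the logistic
regression (p = 0.44); yellow fingers and peer pressure were compared only graphically, not tested.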
- Fatigue, wheezing, and
anxiety are common among lung cancer patients.
- Chronic disease shows no significant effect in the logistic regression
(p = 0.52); alcohol consumption was compared only graphically, not tested.
- Regression shows that age alone is a very weak predictor (R² ≈ 0.001),
with an effect that is borderline at the 5% level (p = 0.0505).
THANK YOU