1 Load Dataset

# Read dataset.csv (path relative to the working directory) into a tibble.
df <- readr::read_csv(file = "dataset.csv")
## Rows: 3000 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): GENDER, LUNG_CANCER
## dbl (14): AGE, SMOKING, YELLOW_FINGERS, ANXIETY, PEER_PRESSURE, CHRONIC_DISE...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Turn the listed columns into factors so they are handled as categories.
# AGE is not listed and therefore stays numeric.
df <- mutate(
  df,
  across(
    .cols = c(
      GENDER, SMOKING, YELLOW_FINGERS, ANXIETY, PEER_PRESSURE,
      CHRONIC_DISEASE, FATIGUE, ALLERGY, WHEEZING, ALCOHOL_CONSUMING,
      COUGHING, SHORTNESS_OF_BREATH, SWALLOWING_DIFFICULTY, CHEST_PAIN,
      LUNG_CANCER
    ),
    .fns = as.factor
  )
)

2 Descriptive Analysis

2.1 Distribution of Age

# Min, quartiles, median, mean and max of the AGE column.
summary(df[["AGE"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   30.00   42.00   55.00   55.17   68.00   80.00
# Histogram of AGE with bins of width 5.
df %>%
  ggplot(mapping = aes(x = AGE)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "skyblue") +
  theme_minimal() +
  labs(title = "Distribution of Age", x = "Age", y = "Count")

2.2 Gender Percentage

# Row count per GENDER level, plus that count as a percentage of all rows.
gender_pct <- mutate(
  count(df, GENDER, name = "Count"),
  Percentage = Count / sum(Count) * 100
)

gender_pct
## # A tibble: 2 × 3
##   GENDER Count Percentage
##   <fct>  <int>      <dbl>
## 1 F       1486       49.5
## 2 M       1514       50.5
# Pie chart: one stacked column whose y values (Percentage) are mapped to the
# angle of a polar coordinate system.
gender_pct %>%
  ggplot(mapping = aes(x = "", y = Percentage, fill = GENDER)) +
  geom_col() +
  coord_polar(theta = "y") +
  theme_void() +
  labs(title = "Gender Distribution")

2.3 Lung Cancer Percentage

# Row count per LUNG_CANCER level, plus that count as a percentage of all rows.
cancer_pct <- mutate(
  count(df, LUNG_CANCER, name = "Count"),
  Percentage = Count / sum(Count) * 100
)

cancer_pct
## # A tibble: 2 × 3
##   LUNG_CANCER Count Percentage
##   <fct>       <int>      <dbl>
## 1 NO           1482       49.4
## 2 YES          1518       50.6
# Pie chart: one stacked column whose y values (Percentage) are mapped to the
# angle of a polar coordinate system.
cancer_pct %>%
  ggplot(mapping = aes(x = "", y = Percentage, fill = LUNG_CANCER)) +
  geom_col() +
  coord_polar(theta = "y") +
  theme_void() +
  labs(title = "Lung Cancer Distribution")

2.4 Average Age by Lung Cancer Status

# Mean AGE within each LUNG_CANCER level.
avg_age <- summarise(
  group_by(df, LUNG_CANCER),
  Average_Age = mean(AGE)
)

avg_age
## # A tibble: 2 × 2
##   LUNG_CANCER Average_Age
##   <fct>             <dbl>
## 1 NO                 55.7
## 2 YES                54.6
# Age distribution for each LUNG_CANCER level.
# A boxplot's centre line is the median, not the mean, so the group mean is
# overlaid as a white diamond; the title and subtitle say what is drawn.
ggplot(df, aes(x = LUNG_CANCER, y = AGE, fill = LUNG_CANCER)) +
  # The fill legend would only repeat the x axis, so it is hidden.
  geom_boxplot(show.legend = FALSE) +
  stat_summary(
    fun = mean, geom = "point",
    shape = 23, size = 3, fill = "white",
    show.legend = FALSE
  ) +
  labs(
    title = "Age of Lung Cancer Patients vs Non-patients",
    subtitle = "Box line = median, diamond = mean",
    x = "Lung cancer",
    y = "Age"
  )

2.5 Smokers and Lung Cancer

# Keep the rows whose SMOKING level is 1 and count them per LUNG_CANCER level.
# NOTE(review): SMOKING is a factor with levels 1 and 2; this code treats
# level 1 as "smoker". Confirm against the data dictionary that 1 (and not 2)
# is the code for smokers -- TODO confirm.
smokers <- df %>%
  filter(SMOKING == 1) %>%
  count(LUNG_CANCER, name = "Count")

smokers
## # A tibble: 2 × 2
##   LUNG_CANCER Count
##   <fct>       <int>
## 1 NO            765
## 2 YES           762

3 Correlation & Risk Factor Analysis

3.1 Smoking vs Lung Cancer

# Share of LUNG_CANCER outcomes within each SMOKING level.
# position = "fill" rescales every bar to a total of 1, so the y axis is
# labelled as a proportion instead of the default "count".
ggplot(df, aes(x = SMOKING, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Smoking vs Lung Cancer", y = "Proportion")

3.2 Chronic Disease Impact

# Share of LUNG_CANCER outcomes within each CHRONIC_DISEASE level.
# position = "fill" rescales every bar to a total of 1, so the y axis is
# labelled as a proportion instead of the default "count".
ggplot(df, aes(x = CHRONIC_DISEASE, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Chronic Disease vs Lung Cancer", y = "Proportion")

3.3 Alcohol Consumption Impact

# Share of LUNG_CANCER outcomes within each ALCOHOL_CONSUMING level.
# position = "fill" rescales every bar to a total of 1, so the y axis is
# labelled as a proportion instead of the default "count".
ggplot(df, aes(x = ALCOHOL_CONSUMING, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Alcohol Consumption vs Lung Cancer", y = "Proportion")

3.4 Wheezing Relation

# Share of LUNG_CANCER outcomes within each WHEEZING level.
# position = "fill" rescales every bar to a total of 1, so the y axis is
# labelled as a proportion instead of the default "count".
ggplot(df, aes(x = WHEEZING, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Wheezing vs Lung Cancer", y = "Proportion")

3.5 Fatigue & Anxiety Effects

# Fatigue

# Share of LUNG_CANCER outcomes within each FATIGUE level.
# position = "fill" rescales every bar to a total of 1, so the y axis is
# labelled as a proportion instead of the default "count".
ggplot(df, aes(x = FATIGUE, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Fatigue vs Lung Cancer", y = "Proportion")

# Anxiety

# Share of LUNG_CANCER outcomes within each ANXIETY level.
# position = "fill" rescales every bar to a total of 1, so the y axis is
# labelled as a proportion instead of the default "count".
ggplot(df, aes(x = ANXIETY, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Anxiety vs Lung Cancer", y = "Proportion")

3.6 Chronic Disease & Chest Pain Combined

# Cross CHRONIC_DISEASE with CHEST_PAIN into a single factor. interaction()
# names each level "<CHRONIC_DISEASE level>.<CHEST_PAIN level>".
df$COMBINED <- interaction(df$CHRONIC_DISEASE, df$CHEST_PAIN)

# Share of LUNG_CANCER outcomes within each combined level.
# position = "fill" rescales every bar to a total of 1, so the y axis is
# labelled as a proportion instead of the default "count"; the x label states
# which factor comes first in the "a.b" codes.
ggplot(df, aes(x = COMBINED, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(
    title = "Chronic Disease & Chest Pain vs Lung Cancer",
    x = "CHRONIC_DISEASE.CHEST_PAIN",
    y = "Proportion"
  )

4 Comparative Analysis

4.1 Gender vs Lung Cancer

# Share of LUNG_CANCER outcomes within each GENDER level.
# position = "fill" rescales every bar to a total of 1, so the y axis is
# labelled as a proportion instead of the default "count".
ggplot(df, aes(x = GENDER, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Gender vs Lung Cancer", y = "Proportion")

4.2 Yellow Fingers vs Lung Cancer

# Share of LUNG_CANCER outcomes within each YELLOW_FINGERS level.
# position = "fill" rescales every bar to a total of 1, so the y axis is
# labelled as a proportion instead of the default "count".
ggplot(df, aes(x = YELLOW_FINGERS, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Yellow Fingers vs Lung Cancer", y = "Proportion")

4.3 Smokers vs Non-Smokers Comparison

# Share of LUNG_CANCER outcomes within each SMOKING level.
# position = "fill" rescales every bar to a total of 1, so the y axis is
# labelled as a proportion instead of the default "count".
ggplot(df, aes(x = SMOKING, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Smokers vs Non-Smokers and Lung Cancer", y = "Proportion")

4.4 Peer Pressure Analysis

# Peer Pressure -> Smoking

# Share of SMOKING levels within each PEER_PRESSURE level.
# position = "fill" rescales every bar to a total of 1, so the y axis is
# labelled as a proportion instead of the default "count".
ggplot(df, aes(x = PEER_PRESSURE, fill = SMOKING)) +
  geom_bar(position = "fill") +
  labs(title = "Peer Pressure vs Smoking", y = "Proportion")

# Peer Pressure -> Lung Cancer

# Share of LUNG_CANCER outcomes within each PEER_PRESSURE level.
# position = "fill" rescales every bar to a total of 1, so the y axis is
# labelled as a proportion instead of the default "count".
ggplot(df, aes(x = PEER_PRESSURE, fill = LUNG_CANCER)) +
  geom_bar(position = "fill") +
  labs(title = "Peer Pressure vs Lung Cancer", y = "Proportion")

5 Predictive Analysis

5.1 Logistic Regression (Age, Smoking, Chronic Disease)

# Logistic regression of LUNG_CANCER on AGE, SMOKING and CHRONIC_DISEASE.
# With a factor response, binomial glm() treats the first level as failure and
# the remaining level(s) as success.
log_model <- glm(
  formula = LUNG_CANCER ~ AGE + SMOKING + CHRONIC_DISEASE,
  family = binomial,
  data = df
)
summary(log_model)
## 
## Call:
## glm(formula = LUNG_CANCER ~ AGE + SMOKING + CHRONIC_DISEASE, 
##     family = binomial, data = df)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)       0.288630   0.152130   1.897   0.0578 .
## AGE              -0.004860   0.002485  -1.956   0.0505 .
## SMOKING2          0.056227   0.073195   0.768   0.4424  
## CHRONIC_DISEASE2 -0.047257   0.073209  -0.646   0.5186  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.5  on 2999  degrees of freedom
## Residual deviance: 4153.7  on 2996  degrees of freedom
## AIC: 4161.7
## 
## Number of Fisher Scoring iterations: 3

5.2 Linear Regression (Age vs Lung Cancer)

# Encode the outcome as a 0/1 indicator (1 = "YES"). Calling as.numeric() on
# the factor itself would return the internal level codes (1 = NO, 2 = YES),
# which shifts the response, and therefore the intercept, up by exactly 1.
df$LUNG_CANCER_NUM <- as.numeric(df$LUNG_CANCER == "YES")
# Linear probability model: the AGE slope is the change in the fitted
# probability of "YES" per unit of AGE.
age_model <- lm(LUNG_CANCER_NUM ~ AGE, data = df)
summary(age_model)
## 
## Call:
## lm(formula = LUNG_CANCER_NUM ~ AGE, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5365 -0.5050  0.4647  0.4926  0.5241 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.5729124  0.0353941  16.187   <2e-16 ***
## AGE         -0.0012129  0.0006199  -1.957   0.0505 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4998 on 2998 degrees of freedom
## Multiple R-squared:  0.001275,   Adjusted R-squared:  0.0009422 
## F-statistic: 3.828 on 1 and 2998 DF,  p-value: 0.05048

6 Final Summary

7 THANK YOU