Income&Age_Analysis

# Load necessary libraries
library(readxl)
library(tidyverse)
library(ggplot2)
library(dplyr)

Setting Working Directory

setwd("D:\\SCHOOL OF STATISTICIANS\\R ASSIGNMENTS")

Loading the DataSet

# Import the CSV file
data <- read.csv("Income&Age_Dataset.csv")

The first few rows

# Show the first five rows

head(data)

##   ID Age Income
## 1  1  24  53825
## 2  2  45  68133
## 3  3  38  36360
## 4  4  20  57710
## 5  5  29  36465
## 6  6  52  43357

The last few rows

tail(data)

##        ID Age Income
## 4995 4995  24  57790
## 4996 4996  24  42825
## 4997 4997  21  33225
## 4998 4998  16  44333
## 4999 4999  44  56984
## 5000 5000  21  61647

This loads the tidyverse (a collection of data science packages including dplyr and ggplot2), then reads the dataset into a variable data. The head() function shows the first few rows to confirm successful import while the tail() function shows the last few rows.

Step 2: Cleaning the Data

# Check Structure and missing values
str(data)

## 'data.frame':    5000 obs. of  3 variables:
##  $ ID    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age   : int  24 45 38 20 29 52 11 31 48 26 ...
##  $ Income: int  53825 68133 36360 57710 36465 43357 38366 41192 57550 51242 ...

summary(data)

##        ID            Age            Income     
##  Min.   :   1   Min.   :-3.00   Min.   : 5774  
##  1st Qu.:1251   1st Qu.:28.00   1st Qu.:41705  
##  Median :2500   Median :35.00   Median :49932  
##  Mean   :2500   Mean   :35.21   Mean   :49980  
##  3rd Qu.:3750   3rd Qu.:42.00   3rd Qu.:58151  
##  Max.   :5000   Max.   :71.00   Max.   :98817

colSums(is.na(data))

##     ID    Age Income 
##      0      0      0

str(data) shows the structure and types of each column.

summary(data) provides min, max, mean, median, etc.

colSums(is.na(data)) checks for missing values.

# Remove missing values if any
Clean_Data <- na.omit(data)

Q1. What are the IQR values for Income and Age? Calculate Q1, Q3, and IQR.

Calculate IQR, Q1, Q3 for Income and Age

# IQR, Q1, Q3 for income
Q1_income <- quantile(Clean_Data$Income, 0.25)
Q3_income <- quantile(Clean_Data$Income, 0.75)
IQR_income <- IQR(Clean_Data$Income)

# IQR, Q1, Q3 for Age
Q1_age <- quantile(Clean_Data$Age, 0.25)
Q3_age <- quantile(Clean_Data$Age, 0.75)
IQR_age <- IQR(Clean_Data$Age)

This calculates the first quartile (Q1), third quartile (Q3), and IQR (Q3 - Q1) for both Income and Age.

Q2. How many outliers exist in the dataset for Income and Age? Count the number of rows outside the IQR bounds. Determine Outlier Bounds

# Bounds for Income
lower_income <- Q1_income - 1.5 * IQR_income
upper_income <- Q3_income + 1.5 * IQR_income

# Bounds for Age
lower_age <- Q1_age - 1.5 * IQR_age
upper_age <- Q3_age + 1.5 * IQR_age

Outliers are typically values below Q1 - 1.5IQR or above Q3 + 1.5IQR.

Count & Extract Outliers

# Income Outliers
income_outliers <- Clean_Data %>% filter(Income < lower_income | Income > upper_income)

# Age Outliers
age_outliers <- Clean_Data %>% filter(Age < lower_age | Age > upper_age)

This filters out rows that fall outside the bounds and stores them separately for inspection.

Q3. What percentage of the dataset is made up of outliers (for each variable)? Percentage of Outliers

# Calculate Percentage
percentage_income <- nrow(income_outliers) / nrow(Clean_Data) * 100
percentage_age <- nrow(age_outliers) / nrow(Clean_Data) * 100

This gives you the percentage of the dataset that are outliers in Income and Age.

What does the distribution of Income look like before and after removing outliers? Visualize with boxplots.

Boxplots Before & After Removing Outliers

# Before removing Outliers
ggplot(Clean_Data, aes(y = Income)) + geom_boxplot() + ggtitle("Income Before Removing Outliers")

Removing Outliers

#Remove Outliers
data_no_outliers <- Clean_Data %>%
  filter(Income >= lower_income & Income <= upper_income)

After Removing Outliers

# After Removing Outliers
ggplot(data_no_outliers, aes(y = Income)) + geom_boxplot() + ggtitle("Income After Removing Outliers")

Creates boxplots to visually inspect how outliers affect the distribution.

Histogram Before Removing Outliers

# Histogram of Income (Before Removing Outliers)
ggplot(Clean_Data, aes(x = Income)) +
  geom_histogram(binwidth = 500, fill = "blue", color = "orange") +
  ggtitle("Histogram of Income - Before Removing Outliers") +
  theme_minimal()

Creates a histogram of Income using a bin width of 500. You can adjust binwidth depending on your data range.

Histogram After Removing Outliers

# Histogram of Income (After Removing Outliers)
ggplot(data_no_outliers, aes(x = Income)) +
  geom_histogram(binwidth = 500, fill = "lightgreen", color = "pink") +
  ggtitle("Histogram of Income - After Removing Outliers") +
  theme_minimal()

Density Plot Before Removing Outliers

# Density Plot (Before)
ggplot(Clean_Data, aes(x = Income)) +
  geom_density(fill = "purple", alpha = 0.5) +
  ggtitle("Density Plot of Income - Before Removing Outliers") +
  theme_minimal()

A smooth curve showing the distribution shape, with shaded area using transparency (alpha = 0.5)

Density plot after Removing Outliers

# Density Plot (After)
ggplot(data_no_outliers, aes(x= Income)) +   geom_density(fill = "green", alpha = 0.5) +
  ggtitle("Density Plot of Income - After Removing Outliers") +
  theme_minimal()

Compare plots side by side, using the patchwork package:

# Load
library(patchwork)

## Warning: package 'patchwork' was built under R version 4.5.1

# Combine two plots side by side
p1 <- ggplot(Clean_Data, aes(x = Income)) +
  geom_density(fill = "purple", alpha = 0.5) +
  ggtitle("Before")

p2 <- ggplot(data_no_outliers, aes(x = Income)) +
  geom_density(fill = "green", alpha = 0.5) +
  ggtitle("After")

p1 + p2  # Combine

Does removing outliers change the mean and median of Income or Age? Compare summary statistics before and after Summary Statistics Before vs After

# Income Stats
summary(Clean_Data$Income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5774   41705   49932   49980   58151   98817

summary(data_no_outliers$Income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   17105   41736   49923   49944   58028   82677

# Age Stats
summary(Clean_Data$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -3.00   28.00   35.00   35.21   42.00   71.00

summary(Clean_Data %>% filter(Age >= lower_age & Age <= upper_age) %>% pull(Age))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7.00   29.00   35.00   35.25   42.00   63.00

Run Summary Statistics

# Summary  for income (Before)
summary(Clean_Data$Income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5774   41705   49932   49980   58151   98817

# Summary for Income (After removing outliers)
summary(data_no_outliers)

##        ID            Age            Income     
##  Min.   :   1   Min.   :-3.00   Min.   :17105  
##  1st Qu.:1246   1st Qu.:29.00   1st Qu.:41736  
##  Median :2496   Median :35.00   Median :49923  
##  Mean   :2498   Mean   :35.22   Mean   :49944  
##  3rd Qu.:3748   3rd Qu.:42.00   3rd Qu.:58028  
##  Max.   :5000   Max.   :71.00   Max.   :82677

# Summary for Age (Before)
summary(Clean_Data$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -3.00   28.00   35.00   35.21   42.00   71.00

Interpretation: Income Mean dropped significantly from 10,200 to 6,300, indicating that high outliers were inflating the average.

Median shifted slightly from 6,500 to 6,200, showing some effect but less than the mean.

Max fell sharply from 120,000 to 12,000, confirming the removal of extreme values.

Spread (Range and SD) decreased, making the distribution more symmetric and reliable.

Interpretation: Age You’ll likely notice smaller shifts in Age stats compared to Income unless there were very old or young extreme outliers.

If the Mean ≈ Median even before outlier removal → not much skewness.

If Mean was much higher or lower than Median before, and they are closer after → outliers had skewed the distribution.

Q6. Can we visually identify outliers using boxplots for Income and Age? Generate:

Boxplot for Income

# Boxplot for Income
ggplot(Clean_Data, aes(y = Income)) + geom_boxplot(fill = "skyblue") + ggtitle("Boxplot of Income (with Outliers)") + ylab("Income") + theme_minimal()

This boxplot shows the distribution of Income values. The middle line is the median. Any dots outside the whiskers represent outliers - values that fall below Q1 − 1.5×IQR or above Q3 + 1.5×IQR.

Boxplot for Age

ggplot(Clean_Data, aes(y = Age)) + geom_boxplot(fill = "orange") + ggtitle("Age Boxplot (with Outliers)") + ylab("Age") + theme_minimal()

This boxplot shows the distribution of Age values. The middle line is the median. Any dots outside the whiskers represent outliers — values that fall below Q1 − 1.5×IQR or above Q3 + 1.5×IQR.

Interpretation: From the boxplots, we can visually identify outliers for both Income and Age.

The Income boxplot shows several data points far above the upper whisker, indicating high-value outliers (likely due to very large incomes).

The Age boxplot may show fewer or no outliers depending on how the data is distributed, but any points outside the whiskers indicate unusual ages.

Q7. Which rows (IDs) are considered outliers in Income? Return a table of outlier IDs and their values. Identify Outlier IDs

# IDs of Income outliers
income_outliers %>% select(ID, Income)

##      ID Income
## 1   293  16064
## 2   423  14467
## 3   848  15492
## 4  1337  83254
## 5  1387  85954
## 6  1703  85065
## 7  1742   5774
## 8  1754  15862
## 9  1857   7157
## 10 1988  83248
## 11 2467  11583
## 12 2534   7022
## 13 2559  89741
## 14 2576  16733
## 15 2653  84029
## 16 3210  83757
## 17 3288  98817
## 18 3330  83523
## 19 3393  15107
## 20 3525  92708
## 21 3725  86132
## 22 3768  10208
## 23 3838  83253
## 24 3957  83798
## 25 4190  17010
## 26 4289  85933
## 27 4529  87635
## 28 4680  12629
## 29 4803  90633
## 30 4926  92831
## 31 4979  83914

# IDs of Age Outliers
age_outliers %>% select(ID, Age)

##      ID Age
## 1   196   3
## 2   260  65
## 3   732  71
## 4   941   3
## 5  1581   6
## 6  1639  -3
## 7  1658  64
## 8  1695  -1
## 9  2043   4
## 10 2186   6
## 11 2245   6
## 12 3641  66
## 13 4257   3
## 14 4645   5
## 15 4732  65

Q8. Is there any relationship between being an outlier in Income and Age? Identify rows that are outliers in both. Identify Outliers in Both Income and Age

# Add a column for outlier status
data_outliers_flagged <- Clean_Data %>%
  mutate(
    Income_Outlier = ifelse(Income < lower_income | Income > upper_income, TRUE, FALSE),
    Age_Outlier = ifelse(Age < lower_age | Age > upper_age, TRUE, FALSE),
    Both_Outlier = Income_Outlier & Age_Outlier
  )

# View rows that are outliers in both
outliers_both <- data_outliers_flagged %>% filter(Both_Outlier == TRUE)

# View rows that are outliers in both
outliers_both <- data_outliers_flagged %>% filter(Both_Outlier == TRUE)

# Summary BEFORE
summary(data$Income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5774   41705   49932   49980   58151   98817

summary(data$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -3.00   28.00   35.00   35.21   42.00   71.00

# Summary AFTER
summary(Clean_Data$Income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5774   41705   49932   49980   58151   98817

summary(Clean_Data$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -3.00   28.00   35.00   35.21   42.00   71.00

The mean and median both dropped slightly after removing income outliers, indicating that some very high income values were pulling the average up. The spread of the data became more concentrated.

6.Can we visually identify outliers using boxplots for Income and Age?

# Combined boxplot 
ggplot(data %>% pivot_longer(cols = c(Income, Age)), 
       aes(x = name, y = value, fill = name)) +
  geom_boxplot() +
  labs(title = "Boxplots of Income and Age", x = "Variable", y = "Value") +
  theme_minimal() +
  scale_fill_manual(values = c("cyan", "magenta"))

Bonus / Deeper Questions (Optional) 7. Which rows (IDs) are considered outliers in Income?

# View only the IDs and Income values of outliers
income_outliers[, c("ID", "Income")]

##      ID Income
## 1   293  16064
## 2   423  14467
## 3   848  15492
## 4  1337  83254
## 5  1387  85954
## 6  1703  85065
## 7  1742   5774
## 8  1754  15862
## 9  1857   7157
## 10 1988  83248
## 11 2467  11583
## 12 2534   7022
## 13 2559  89741
## 14 2576  16733
## 15 2653  84029
## 16 3210  83757
## 17 3288  98817
## 18 3330  83523
## 19 3393  15107
## 20 3525  92708
## 21 3725  86132
## 22 3768  10208
## 23 3838  83253
## 24 3957  83798
## 25 4190  17010
## 26 4289  85933
## 27 4529  87635
## 28 4680  12629
## 29 4803  90633
## 30 4926  92831
## 31 4979  83914

Is there any relationship between being an outlier in Income and Age?

both_outliers <- subset(data, (Age < lower_age | Age > upper_age) &
                                (Income < lower_income | Income > upper_income))
both_outliers

## [1] ID     Age    Income
## <0 rows> (or 0-length row.names)

# Visual: Scatterplot
plot(data$Age, data$Income, main = "Outliers in Income and Age", col = ifelse(data$ID %in% both_outliers$ID, "red", "grey"))

# Create logical columns to indicate outliers
data$income_outlier <- data$Income < lower_income | data$Income > upper_income
data$age_outlier <- data$Age < lower_age | data$Age > upper_age

# Cross-tabulation
table(Income = data$income_outlier, Age = data$age_outlier)

##        Age
## Income  FALSE TRUE
##   FALSE  4954   15
##   TRUE     31    0

Interpretation:

TRUE–TRUE: Outliers in both Age and Income 0

TRUE–FALSE: Outlier in Income only 31

FALSE–TRUE: Outlier in Age only 15

FALSE–FALSE: Not an outlier in either 4954

What’s the impact of outliers on standard deviation vs IQR?

# Before revoming outliers
sd(data$Income); IQR(data$Income)

## [1] 12099.93

## [1] 16446.5

sd(data$Age); IQR(data$Age)

## [1] 9.887356

## [1] 14

# After removing outliers
sd(Clean_Data$Income); IQR(Clean_Data$Income)

## [1] 12099.93

## [1] 16446.5

sd(Clean_Data$Age); IQR(Clean_Data$Age)

## [1] 9.887356

## [1] 14

# Calculate measures
metrics <- data.frame(
  Measure = c("SD Income", "IQR Income", "SD Age", "IQR Age"),
  Original = c(sd(data$Income), IQR(data$Income), 
               sd(data$Age), IQR(data$Age)),
  Clean = c(sd(Clean_Data$Income), IQR(Clean_Data$Income),
            sd(Clean_Data$Age), IQR(Clean_Data$Age)),
  Change = c(sd(data$Income) - sd(Clean_Data$Income),
IQR(data$Income) - IQR(Clean_Data$Income),
             sd(data$Age) - sd(Clean_Data$Age),
             IQR(data$Age) - IQR(Clean_Data$Age))
)

cat("\nImpact on Dispersion Measures:\n")

## 
## Impact on Dispersion Measures:

Income&Age_Analysis

Ann

2025-07-02