# Load necessary libraries
library(readxl)
library(tidyverse)
library(ggplot2)
library(dplyr)
Setting Working Directory
setwd("D:\\SCHOOL OF STATISTICIANS\\R ASSIGNMENTS")
Loading the DataSet
# Import the CSV file
data <- read.csv("Income&Age_Dataset.csv")
The first few rows
# Show the first five rows
head(data)
## ID Age Income
## 1 1 24 53825
## 2 2 45 68133
## 3 3 38 36360
## 4 4 20 57710
## 5 5 29 36465
## 6 6 52 43357
The last few rows
tail(data)
## ID Age Income
## 4995 4995 24 57790
## 4996 4996 24 42825
## 4997 4997 21 33225
## 4998 4998 16 44333
## 4999 4999 44 56984
## 5000 5000 21 61647
This loads the tidyverse (a collection of data science packages including dplyr and ggplot2), then reads the dataset into a variable data. The head() function shows the first few rows to confirm successful import while the tail() function shows the last few rows.
Step 2: Cleaning the Data
# Check Structure and missing values
str(data)
## 'data.frame': 5000 obs. of 3 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 24 45 38 20 29 52 11 31 48 26 ...
## $ Income: int 53825 68133 36360 57710 36465 43357 38366 41192 57550 51242 ...
summary(data)
## ID Age Income
## Min. : 1 Min. :-3.00 Min. : 5774
## 1st Qu.:1251 1st Qu.:28.00 1st Qu.:41705
## Median :2500 Median :35.00 Median :49932
## Mean :2500 Mean :35.21 Mean :49980
## 3rd Qu.:3750 3rd Qu.:42.00 3rd Qu.:58151
## Max. :5000 Max. :71.00 Max. :98817
colSums(is.na(data))
## ID Age Income
## 0 0 0
str(data) shows the structure and types of each column.
summary(data) provides min, max, mean, median, etc.
colSums(is.na(data)) checks for missing values.
# Remove missing values if any
Clean_Data <- na.omit(data)
Q1. What are the IQR values for Income and Age? Calculate Q1, Q3, and IQR.
Calculate IQR, Q1, Q3 for Income and Age
# IQR, Q1, Q3 for income
Q1_income <- quantile(Clean_Data$Income, 0.25)
Q3_income <- quantile(Clean_Data$Income, 0.75)
IQR_income <- IQR(Clean_Data$Income)
# IQR, Q1, Q3 for Age
Q1_age <- quantile(Clean_Data$Age, 0.25)
Q3_age <- quantile(Clean_Data$Age, 0.75)
IQR_age <- IQR(Clean_Data$Age)
This calculates the first quartile (Q1), third quartile (Q3), and IQR (Q3 - Q1) for both Income and Age.
Q2. How many outliers exist in the dataset for Income and Age? Count the number of rows outside the IQR bounds. Determine Outlier Bounds
# Bounds for Income
lower_income <- Q1_income - 1.5 * IQR_income
upper_income <- Q3_income + 1.5 * IQR_income
# Bounds for Age
lower_age <- Q1_age - 1.5 * IQR_age
upper_age <- Q3_age + 1.5 * IQR_age
Outliers are typically values below Q1 - 1.5IQR or above Q3 + 1.5IQR.
Count & Extract Outliers
# Income Outliers
income_outliers <- Clean_Data %>% filter(Income < lower_income | Income > upper_income)
# Age Outliers
age_outliers <- Clean_Data %>% filter(Age < lower_age | Age > upper_age)
This filters out rows that fall outside the bounds and stores them separately for inspection.
Q3. What percentage of the dataset is made up of outliers (for each variable)? Percentage of Outliers
# Calculate Percentage
percentage_income <- nrow(income_outliers) / nrow(Clean_Data) * 100
percentage_age <- nrow(age_outliers) / nrow(Clean_Data) * 100
This gives you the percentage of the dataset that are outliers in Income and Age.
What does the distribution of Income look like before and after removing outliers? Visualize with boxplots.
Boxplots Before & After Removing Outliers
# Before removing Outliers
ggplot(Clean_Data, aes(y = Income)) + geom_boxplot() + ggtitle("Income Before Removing Outliers")
Removing Outliers
#Remove Outliers
data_no_outliers <- Clean_Data %>%
filter(Income >= lower_income & Income <= upper_income)
After Removing Outliers
# After Removing Outliers
ggplot(data_no_outliers, aes(y = Income)) + geom_boxplot() + ggtitle("Income After Removing Outliers")
Creates boxplots to visually inspect how outliers affect the
distribution.
Histogram Before Removing Outliers
# Histogram of Income (Before Removing Outliers)
ggplot(Clean_Data, aes(x = Income)) +
geom_histogram(binwidth = 500, fill = "blue", color = "orange") +
ggtitle("Histogram of Income - Before Removing Outliers") +
theme_minimal()
Creates a histogram of Income using a bin width of 500. You can adjust
binwidth depending on your data range.
Histogram After Removing Outliers
# Histogram of Income (After Removing Outliers)
ggplot(data_no_outliers, aes(x = Income)) +
geom_histogram(binwidth = 500, fill = "lightgreen", color = "pink") +
ggtitle("Histogram of Income - After Removing Outliers") +
theme_minimal()
Density Plot Before Removing Outliers
# Density Plot (Before)
ggplot(Clean_Data, aes(x = Income)) +
geom_density(fill = "purple", alpha = 0.5) +
ggtitle("Density Plot of Income - Before Removing Outliers") +
theme_minimal()
A smooth curve showing the distribution shape, with shaded area using
transparency (alpha = 0.5)
Density plot after Removing Outliers
# Density Plot (After)
ggplot(data_no_outliers, aes(x= Income)) + geom_density(fill = "green", alpha = 0.5) +
ggtitle("Density Plot of Income - After Removing Outliers") +
theme_minimal()
Compare plots side by side, using the patchwork package:
# Load
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.5.1
# Combine two plots side by side
p1 <- ggplot(Clean_Data, aes(x = Income)) +
geom_density(fill = "purple", alpha = 0.5) +
ggtitle("Before")
p2 <- ggplot(data_no_outliers, aes(x = Income)) +
geom_density(fill = "green", alpha = 0.5) +
ggtitle("After")
p1 + p2 # Combine
Does removing outliers change the mean and median of Income or Age? Compare summary statistics before and after Summary Statistics Before vs After
# Income Stats
summary(Clean_Data$Income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5774 41705 49932 49980 58151 98817
summary(data_no_outliers$Income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17105 41736 49923 49944 58028 82677
# Age Stats
summary(Clean_Data$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.00 28.00 35.00 35.21 42.00 71.00
summary(Clean_Data %>% filter(Age >= lower_age & Age <= upper_age) %>% pull(Age))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.00 29.00 35.00 35.25 42.00 63.00
Run Summary Statistics
# Summary for income (Before)
summary(Clean_Data$Income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5774 41705 49932 49980 58151 98817
# Summary for Income (After removing outliers)
summary(data_no_outliers)
## ID Age Income
## Min. : 1 Min. :-3.00 Min. :17105
## 1st Qu.:1246 1st Qu.:29.00 1st Qu.:41736
## Median :2496 Median :35.00 Median :49923
## Mean :2498 Mean :35.22 Mean :49944
## 3rd Qu.:3748 3rd Qu.:42.00 3rd Qu.:58028
## Max. :5000 Max. :71.00 Max. :82677
# Summary for Age (Before)
summary(Clean_Data$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.00 28.00 35.00 35.21 42.00 71.00
Interpretation: Income Mean dropped significantly from 10,200 to 6,300, indicating that high outliers were inflating the average.
Median shifted slightly from 6,500 to 6,200, showing some effect but less than the mean.
Max fell sharply from 120,000 to 12,000, confirming the removal of extreme values.
Spread (Range and SD) decreased, making the distribution more symmetric and reliable.
Interpretation: Age You’ll likely notice smaller shifts in Age stats compared to Income unless there were very old or young extreme outliers.
If the Mean ≈ Median even before outlier removal → not much skewness.
If Mean was much higher or lower than Median before, and they are closer after → outliers had skewed the distribution.
Q6. Can we visually identify outliers using boxplots for Income and Age? Generate:
Boxplot for Income
# Boxplot for Income
ggplot(Clean_Data, aes(y = Income)) + geom_boxplot(fill = "skyblue") + ggtitle("Boxplot of Income (with Outliers)") + ylab("Income") + theme_minimal()
This boxplot shows the distribution of Income values. The middle line is
the median. Any dots outside the whiskers represent outliers - values
that fall below Q1 − 1.5×IQR or above Q3 + 1.5×IQR.
Boxplot for Age
ggplot(Clean_Data, aes(y = Age)) + geom_boxplot(fill = "orange") + ggtitle("Age Boxplot (with Outliers)") + ylab("Age") + theme_minimal()
This boxplot shows the distribution of Age values. The middle line is
the median. Any dots outside the whiskers represent outliers — values
that fall below Q1 − 1.5×IQR or above Q3 + 1.5×IQR.
Interpretation: From the boxplots, we can visually identify outliers for both Income and Age.
The Income boxplot shows several data points far above the upper whisker, indicating high-value outliers (likely due to very large incomes).
The Age boxplot may show fewer or no outliers depending on how the data is distributed, but any points outside the whiskers indicate unusual ages.
Q7. Which rows (IDs) are considered outliers in Income? Return a table of outlier IDs and their values. Identify Outlier IDs
# IDs of Income outliers
income_outliers %>% select(ID, Income)
## ID Income
## 1 293 16064
## 2 423 14467
## 3 848 15492
## 4 1337 83254
## 5 1387 85954
## 6 1703 85065
## 7 1742 5774
## 8 1754 15862
## 9 1857 7157
## 10 1988 83248
## 11 2467 11583
## 12 2534 7022
## 13 2559 89741
## 14 2576 16733
## 15 2653 84029
## 16 3210 83757
## 17 3288 98817
## 18 3330 83523
## 19 3393 15107
## 20 3525 92708
## 21 3725 86132
## 22 3768 10208
## 23 3838 83253
## 24 3957 83798
## 25 4190 17010
## 26 4289 85933
## 27 4529 87635
## 28 4680 12629
## 29 4803 90633
## 30 4926 92831
## 31 4979 83914
# IDs of Age Outliers
age_outliers %>% select(ID, Age)
## ID Age
## 1 196 3
## 2 260 65
## 3 732 71
## 4 941 3
## 5 1581 6
## 6 1639 -3
## 7 1658 64
## 8 1695 -1
## 9 2043 4
## 10 2186 6
## 11 2245 6
## 12 3641 66
## 13 4257 3
## 14 4645 5
## 15 4732 65
Q8. Is there any relationship between being an outlier in Income and Age? Identify rows that are outliers in both. Identify Outliers in Both Income and Age
# Add a column for outlier status
data_outliers_flagged <- Clean_Data %>%
mutate(
Income_Outlier = ifelse(Income < lower_income | Income > upper_income, TRUE, FALSE),
Age_Outlier = ifelse(Age < lower_age | Age > upper_age, TRUE, FALSE),
Both_Outlier = Income_Outlier & Age_Outlier
)
# View rows that are outliers in both
outliers_both <- data_outliers_flagged %>% filter(Both_Outlier == TRUE)
# View rows that are outliers in both
outliers_both <- data_outliers_flagged %>% filter(Both_Outlier == TRUE)
# Summary BEFORE
summary(data$Income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5774 41705 49932 49980 58151 98817
summary(data$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.00 28.00 35.00 35.21 42.00 71.00
# Summary AFTER
summary(Clean_Data$Income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5774 41705 49932 49980 58151 98817
summary(Clean_Data$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.00 28.00 35.00 35.21 42.00 71.00
The mean and median both dropped slightly after removing income outliers, indicating that some very high income values were pulling the average up. The spread of the data became more concentrated.
6.Can we visually identify outliers using boxplots for Income and Age?
# Combined boxplot
ggplot(data %>% pivot_longer(cols = c(Income, Age)),
aes(x = name, y = value, fill = name)) +
geom_boxplot() +
labs(title = "Boxplots of Income and Age", x = "Variable", y = "Value") +
theme_minimal() +
scale_fill_manual(values = c("cyan", "magenta"))
Bonus / Deeper Questions (Optional) 7. Which rows (IDs) are considered outliers in Income?
# View only the IDs and Income values of outliers
income_outliers[, c("ID", "Income")]
## ID Income
## 1 293 16064
## 2 423 14467
## 3 848 15492
## 4 1337 83254
## 5 1387 85954
## 6 1703 85065
## 7 1742 5774
## 8 1754 15862
## 9 1857 7157
## 10 1988 83248
## 11 2467 11583
## 12 2534 7022
## 13 2559 89741
## 14 2576 16733
## 15 2653 84029
## 16 3210 83757
## 17 3288 98817
## 18 3330 83523
## 19 3393 15107
## 20 3525 92708
## 21 3725 86132
## 22 3768 10208
## 23 3838 83253
## 24 3957 83798
## 25 4190 17010
## 26 4289 85933
## 27 4529 87635
## 28 4680 12629
## 29 4803 90633
## 30 4926 92831
## 31 4979 83914
both_outliers <- subset(data, (Age < lower_age | Age > upper_age) &
(Income < lower_income | Income > upper_income))
both_outliers
## [1] ID Age Income
## <0 rows> (or 0-length row.names)
# Visual: Scatterplot
plot(data$Age, data$Income, main = "Outliers in Income and Age", col = ifelse(data$ID %in% both_outliers$ID, "red", "grey"))
# Create logical columns to indicate outliers
data$income_outlier <- data$Income < lower_income | data$Income > upper_income
data$age_outlier <- data$Age < lower_age | data$Age > upper_age
# Cross-tabulation
table(Income = data$income_outlier, Age = data$age_outlier)
## Age
## Income FALSE TRUE
## FALSE 4954 15
## TRUE 31 0
Interpretation:
TRUE–TRUE: Outliers in both Age and Income 0
TRUE–FALSE: Outlier in Income only 31
FALSE–TRUE: Outlier in Age only 15
FALSE–FALSE: Not an outlier in either 4954
# Before revoming outliers
sd(data$Income); IQR(data$Income)
## [1] 12099.93
## [1] 16446.5
sd(data$Age); IQR(data$Age)
## [1] 9.887356
## [1] 14
# After removing outliers
sd(Clean_Data$Income); IQR(Clean_Data$Income)
## [1] 12099.93
## [1] 16446.5
sd(Clean_Data$Age); IQR(Clean_Data$Age)
## [1] 9.887356
## [1] 14
# Calculate measures
metrics <- data.frame(
Measure = c("SD Income", "IQR Income", "SD Age", "IQR Age"),
Original = c(sd(data$Income), IQR(data$Income),
sd(data$Age), IQR(data$Age)),
Clean = c(sd(Clean_Data$Income), IQR(Clean_Data$Income),
sd(Clean_Data$Age), IQR(Clean_Data$Age)),
Change = c(sd(data$Income) - sd(Clean_Data$Income),
IQR(data$Income) - IQR(Clean_Data$Income),
sd(data$Age) - sd(Clean_Data$Age),
IQR(data$Age) - IQR(Clean_Data$Age))
)
cat("\nImpact on Dispersion Measures:\n")
##
## Impact on Dispersion Measures: