LIBRARIES USED

ISLR

ggplot2

dplyr

tidyr

library(ISLR)
library(ggplot2)
library(dplyr)
library(tidyr)
# List the datasets that ship with ISLR, then take a working copy of Credit.
data(package = "ISLR")
data1 <- ISLR::Credit

CREDIT DATASET

# Per-column summary statistics for the Credit data.
summary(ISLR::Credit)
       ID            Income           Limit           Rating     
 Min.   :  1.0   Min.   : 10.35   Min.   :  855   Min.   : 93.0  
 1st Qu.:100.8   1st Qu.: 21.01   1st Qu.: 3088   1st Qu.:247.2  
 Median :200.5   Median : 33.12   Median : 4622   Median :344.0  
 Mean   :200.5   Mean   : 45.22   Mean   : 4736   Mean   :354.9  
 3rd Qu.:300.2   3rd Qu.: 57.47   3rd Qu.: 5873   3rd Qu.:437.2  
 Max.   :400.0   Max.   :186.63   Max.   :13913   Max.   :982.0  
     Cards            Age          Education        Gender    Student  
 Min.   :1.000   Min.   :23.00   Min.   : 5.00    Male :193   No :360  
 1st Qu.:2.000   1st Qu.:41.75   1st Qu.:11.00   Female:207   Yes: 40  
 Median :3.000   Median :56.00   Median :14.00                         
 Mean   :2.958   Mean   :55.67   Mean   :13.45                         
 3rd Qu.:4.000   3rd Qu.:70.00   3rd Qu.:16.00                         
 Max.   :9.000   Max.   :98.00   Max.   :20.00                         
 Married              Ethnicity      Balance       
 No :155   African American: 99   Min.   :   0.00  
 Yes:245   Asian           :102   1st Qu.:  68.75  
           Caucasian       :199   Median : 459.50  
                                  Mean   : 520.01  
                                  3rd Qu.: 863.00  
                                  Max.   :1999.00  

Plots used for Credit Dataset

Histogram

Credit Score Distribution: What is the overall distribution of credit scores in the dataset?

# Histogram of the Rating column, split into 20 bins.
ggplot(data = data1, mapping = aes(x = Rating)) +
  geom_histogram(bins = 20, fill = "lightgreen", color = "black") +
  ggtitle("Distribution of Credit Ratings") +
  xlab("Credit Rating") +
  ylab("Frequency") +
  theme_minimal()

Box plot

Credit Score by Age Group: How do credit scores vary across different age groups within the dataset?

# Replace the numeric Age column with an age-band label.
# Each label states exactly the range its condition covers (lower bound
# inclusive, upper bound exclusive), so no two labels appear to overlap.
# case_when() evaluates top to bottom, so each branch only needs its upper
# bound; ages below 18 and missing ages become NA.
# NOTE: this overwrites Age with character labels, so run it only once per
# freshly loaded data1.
data1 <- data1 %>%
  mutate(Age = case_when(
    Age < 18 ~ NA_character_,
    Age < 30 ~ "18-29",
    Age < 40 ~ "30-39",
    Age < 50 ~ "40-49",
    Age < 60 ~ "50-59",
    Age < 65 ~ "60-64",
    Age >= 65 ~ "65+",
    TRUE ~ NA_character_
  ))

# One box per age band showing the spread of credit ratings.
ggplot(data1, aes(x = Age, y = Rating)) +
  geom_boxplot(fill = "white") +
  labs(
    title = "Credit Rating by Age Group",
    x = "Age Group",
    y = "Credit Rating"
  )

Barplot

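Impact of Payment History: How does payment history affect the average credit score among individuals? (The Credit dataset has no payment-history variable, so the plot below groups customers by outstanding Balance as a stand-in.)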

# Bin Balance into four bands. cut() returns a factor whose levels follow the
# numeric order of the breaks, so the bars are drawn low-to-high instead of in
# alphabetical label order. right = FALSE makes each band [lower, upper).
data <- data1 %>%
  mutate(Balance = cut(
    Balance,
    breaks = c(-Inf, 500, 1000, 1500, Inf),
    labels = c("<500", "500-999", "1000-1499", "1500+"),
    right = FALSE
  ))

# Mean credit rating within each balance band.
average_credit_rating <- data %>%
  group_by(Balance) %>%
  summarize(Rating = mean(Rating, na.rm = TRUE))

# Bar chart of the band means. Colours are matched to bands by name so the
# mapping does not depend on level order.
ggplot(average_credit_rating, aes(x = Balance, y = Rating, fill = Balance)) +
  geom_col(color = "black") +
  labs(
    title = "Average Credit Score by Balance Group",
    x = "Balance Group",
    y = "Average Credit Score"
  ) +
  scale_fill_manual(values = c(
    "<500" = "red",
    "500-999" = "green",
    "1000-1499" = "blue",
    "1500+" = "yellow"
  ))

Scatter Plot

Credit Utilization Ratio: What is the relationship between credit utilization ratios and credit scores in the dataset?

# Utilization ratio: outstanding balance as a fraction of the credit limit.
data1$credit_utilization <- data1$Balance / data1$Limit

# Scatter of rating against utilization with a straight-line fit.
# The smoothing formula is given explicitly: ggplot2 emits its
# "using formula" message when the plot is printed, which happens after
# suppressMessages() has already returned, so wrapping the plot object in
# suppressMessages() does not silence it.
ggplot(data1, aes(x = credit_utilization, y = Rating)) +
  geom_point(alpha = 0.9, color = "blue") +
  geom_smooth(method = "lm", formula = y ~ x, color = "black", se = FALSE) +
  labs(
    title = "Credit Utilization Ratio vs. Credit Rating",
    x = "Credit Utilization Ratio",
    y = "Credit Rating"
  )
## `geom_smooth()` using formula = 'y ~ x'

Line plot

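Credit Score Trends: How have credit scores changed over the past few years? Are there noticeable trends? (The Credit dataset has no time variable, so the plot below shows how the average rating varies with the number of cards instead.)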

# Mean credit rating for each distinct card count.
trend_data <- summarise(
  group_by(data1, Cards),
  Average_Rating = mean(Rating, na.rm = TRUE)
)

# Line through the per-count means, with a red marker at each point.
ggplot(data = trend_data, mapping = aes(x = Cards, y = Average_Rating)) +
  geom_line(linewidth = 0.8, color = "black") +
  geom_point(shape = 16, size = 3, color = "red") +
  ggtitle("Average Credit Rating by Number of Cards") +
  xlab("Number of Cards") +
  ylab("Average Credit Rating") +
  theme_minimal()

library(ISLR)
library(ggplot2)
library(dplyr)
library(tidyr)
# List the ISLR datasets again, then point data1 at the Wage data.
# From here on data1 no longer refers to the Credit data.
data(package = "ISLR")
data1 <- ISLR::Wage

WAGE DATASET

# Per-column summary statistics for the Wage data.
summary(ISLR::Wage)
      year           age                     maritl           race     
 Min.   :2003   Min.   :18.00   1. Never Married: 648   1. White:2480  
 1st Qu.:2004   1st Qu.:33.75   2. Married      :2074   2. Black: 293  
 Median :2006   Median :42.00   3. Widowed      :  19   3. Asian: 190  
 Mean   :2006   Mean   :42.41   4. Divorced     : 204   4. Other:  37  
 3rd Qu.:2008   3rd Qu.:51.00   5. Separated    :  55                  
 Max.   :2009   Max.   :80.00                                          
                                                                       
              education                     region               jobclass   
 1. < HS Grad      :268   2. Middle Atlantic   :3000   1. Industrial :1544  
 2. HS Grad        :971   1. New England       :   0   2. Information:1456  
 3. Some College   :650   3. East North Central:   0                        
 4. College Grad   :685   4. West North Central:   0                        
 5. Advanced Degree:426   5. South Atlantic    :   0                        
                          6. East South Central:   0                        
                          (Other)              :   0                        
            health      health_ins      logwage           wage       
 1. <=Good     : 858   1. Yes:2083   Min.   :3.000   Min.   : 20.09  
 2. >=Very Good:2142   2. No : 917   1st Qu.:4.447   1st Qu.: 85.38  
                                     Median :4.653   Median :104.92  
                                     Mean   :4.654   Mean   :111.70  
                                     3rd Qu.:4.857   3rd Qu.:128.68  
                                     Max.   :5.763   Max.   :318.34  
                                                                     

Plots used for Wage Dataset

Histogram

Wage Distribution: What is the distribution of wages in the dataset, and how does it compare to national averages?

# Histogram of the wage column using bins 8 units wide.
ggplot(data = data1, mapping = aes(x = wage)) +
  geom_histogram(color = "black", fill = "yellow", binwidth = 8) +
  ggtitle("Wage Distribution") +
  xlab("Wage") +
  ylab("Frequency") +
  theme_minimal()

# Reference value drawn as a vertical line on the histogram.
# NOTE(review): 50 is hard-coded and is not computed from the data; confirm
# the source of this figure.
national_average <- 50

# Same wage histogram as above, semi-transparent, with a dashed red line at
# the reference value and a text label just to its right.
ggplot(data = data1, mapping = aes(x = wage)) +
  geom_histogram(
    binwidth = 8,
    alpha = 0.6,
    color = "black",
    fill = "darkgreen"
  ) +
  geom_vline(
    xintercept = national_average,
    linetype = "dashed",
    linewidth = 1,
    color = "red"
  ) +
  annotate(
    "text",
    label = "National Avg",
    x = national_average + 5, # nudge the label off the line
    y = 200,
    size = 4,
    fontface = "bold",
    color = "navyblue"
  ) +
  ggtitle("Wage Distribution with National Average") +
  xlab("Wage") +
  ylab("Frequency") +
  theme_minimal()

Bar plot

Average Wage by Industry: What are the average wages across different industries represented in the dataset?

# Mean wage for each job class.
average_wage_by_industry <- data1 %>%
  group_by(jobclass) %>%
  summarize(Average_Wage = mean(wage, na.rm = TRUE))

# Horizontal bar chart of the means, bars sorted by average wage.
# reorder() re-levels jobclass by Average_Wage so the axis keeps the class
# names; order() would return integer row positions and plot those instead.
ggplot(
  average_wage_by_industry,
  aes(x = reorder(jobclass, Average_Wage), y = Average_Wage, fill = jobclass)
) +
  geom_col() +
  labs(title = "Average Wage by jobclass ", x = "jobclass", y = "Average Wage") +
  theme_minimal() +
  coord_flip()

Line plot

Wage Growth Over Time: How have wages changed over time within the dataset? Are there specific periods of growth or decline?

# Mean wage for each survey year.
average_wage_over_time <- summarise(
  group_by(data1, year),
  Average_Wage = mean(wage, na.rm = TRUE)
)

# Blue line through the yearly means, with a point marking each year.
ggplot(
  data = average_wage_over_time,
  mapping = aes(x = year, y = Average_Wage)
) +
  geom_line(color = "blue") +
  geom_point() +
  ggtitle("Wage Growth Over Time") +
  xlab("Year") +
  ylab("Average Wage") +
  theme_minimal()

Box plot

Wage vs Education Level: How do wages differ by education level, and is there a significant correlation between education and wage?

# One box per education level, filled from the Brewer "Set3" palette.
ggplot(
  data = data1,
  mapping = aes(x = education, y = wage, fill = education)
) +
  geom_boxplot() +
  scale_fill_brewer(palette = "Set3") +
  ggtitle("Wage Distribution by Education Level") +
  xlab("Education Level") +
  ylab("Wage") +
  theme_minimal()

Density

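Impact of Employment Type: How do wages vary between full-time, part-time, and contract workers in the dataset? (The Wage dataset does not record full-time, part-time, or contract status, so the plot below compares the two job classes, Industrial and Information, instead.)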

# Overlaid, semi-transparent wage densities, one per job class.
# Fill and outline share the Brewer "Set2" palette so each class has a single
# colour.
ggplot(
  data = data1,
  mapping = aes(x = wage, fill = jobclass, color = jobclass)
) +
  geom_density(alpha = 0.4) +
  scale_fill_brewer(palette = "Set2") +
  scale_color_brewer(palette = "Set2") +
  ggtitle("Density Plot of Wages by Employment Type") +
  xlab("Wage") +
  ylab("Density") +
  theme_minimal() +
  theme(legend.position = "right")