The dataset is loaded from the Github repository. Columns from 2 to 7 are used.
#install.packages("plyr")
#library(plyr)
## Reading .csv from Github
CreditCard <- read.csv("https://raw.githubusercontent.com/deepasharma06/R-HW3/main/CreditCard.csv",
stringsAsFactors = FALSE)
summary(CreditCard)
## X card reports age
## Min. : 1.0 Length:1319 Min. : 0.0000 Min. : 0.1667
## 1st Qu.: 330.5 Class :character 1st Qu.: 0.0000 1st Qu.:25.4167
## Median : 660.0 Mode :character Median : 0.0000 Median :31.2500
## Mean : 660.0 Mean : 0.4564 Mean :33.2131
## 3rd Qu.: 989.5 3rd Qu.: 0.0000 3rd Qu.:39.4167
## Max. :1319.0 Max. :14.0000 Max. :83.5000
## income share expenditure owner
## Min. : 0.210 Min. :0.0001091 Min. : 0.000 Length:1319
## 1st Qu.: 2.244 1st Qu.:0.0023159 1st Qu.: 4.583 Class :character
## Median : 2.900 Median :0.0388272 Median : 101.298 Mode :character
## Mean : 3.365 Mean :0.0687322 Mean : 185.057
## 3rd Qu.: 4.000 3rd Qu.:0.0936168 3rd Qu.: 249.036
## Max. :13.500 Max. :0.9063205 Max. :3099.505
## selfemp dependents months majorcards
## Length:1319 Min. :0.0000 Min. : 0.00 Min. :0.0000
## Class :character 1st Qu.:0.0000 1st Qu.: 12.00 1st Qu.:1.0000
## Mode :character Median :1.0000 Median : 30.00 Median :1.0000
## Mean :0.9939 Mean : 55.27 Mean :0.8173
## 3rd Qu.:2.0000 3rd Qu.: 72.00 3rd Qu.:1.0000
## Max. :6.0000 Max. :540.00 Max. :1.0000
## active
## Min. : 0.000
## 1st Qu.: 2.000
## Median : 6.000
## Mean : 6.997
## 3rd Qu.:11.000
## Max. :46.000
colnames(CreditCard) <- c("card", "reports", "age", "income", "share", "expenditure", "owner")
summary(CreditCard)
## card reports age income
## Min. : 1.0 Length:1319 Min. : 0.0000 Min. : 0.1667
## 1st Qu.: 330.5 Class :character 1st Qu.: 0.0000 1st Qu.:25.4167
## Median : 660.0 Mode :character Median : 0.0000 Median :31.2500
## Mean : 660.0 Mean : 0.4564 Mean :33.2131
## 3rd Qu.: 989.5 3rd Qu.: 0.0000 3rd Qu.:39.4167
## Max. :1319.0 Max. :14.0000 Max. :83.5000
## share expenditure owner NA
## Min. : 0.210 Min. :0.0001091 Min. : 0.000 Length:1319
## 1st Qu.: 2.244 1st Qu.:0.0023159 1st Qu.: 4.583 Class :character
## Median : 2.900 Median :0.0388272 Median : 101.298 Mode :character
## Mean : 3.365 Mean :0.0687322 Mean : 185.057
## 3rd Qu.: 4.000 3rd Qu.:0.0936168 3rd Qu.: 249.036
## Max. :13.500 Max. :0.9063205 Max. :3099.505
## NA NA NA NA
## Length:1319 Min. :0.0000 Min. : 0.00 Min. :0.0000
## Class :character 1st Qu.:0.0000 1st Qu.: 12.00 1st Qu.:1.0000
## Mode :character Median :1.0000 Median : 30.00 Median :1.0000
## Mean :0.9939 Mean : 55.27 Mean :0.8173
## 3rd Qu.:2.0000 3rd Qu.: 72.00 3rd Qu.:1.0000
## Max. :6.0000 Max. :540.00 Max. :1.0000
## NA
## Min. : 0.000
## 1st Qu.: 2.000
## Median : 6.000
## Mean : 6.997
## 3rd Qu.:11.000
## Max. :46.000
# Q1: Data Exploration:
mean_age <- mean(CreditCard[,3])
median_age <- median(CreditCard[,3])
mean_income <- mean(CreditCard[,4])
median_income <- median(CreditCard[,4])
mean_expenditure <- mean(CreditCard[,6])
median_expenditure <- median(CreditCard[,6])
print(sprintf("Mean age is %f, median age %f", mean_age, median_age))
## [1] "Mean age is 0.456406, median age 0.000000"
print(sprintf("Mean income is %f, median income %f", mean_income, median_income))
## [1] "Mean income is 33.213103, median income 31.250000"
print(sprintf("Mean expediture is %f, median expenditure %f", mean_expenditure, median_expenditure))
## [1] "Mean expediture is 0.068732, median expenditure 0.038827"
## Q2: Data Wrangling / creating new data frame with a new order of data
# CreditCard
CreditCard_new <- CreditCard[order(CreditCard$income, decreasing = TRUE),]
#Q2: Data Wrangling / Renaming columns
names(CreditCard_new)[names(CreditCard_new)=="card"] <- "Card#"
names(CreditCard_new)[names(CreditCard_new)=="reports"] <- "Reports"
names(CreditCard_new)[names(CreditCard_new)=="age"] <- "Age_in_Years"
names(CreditCard_new)[names(CreditCard_new)=="income"] <- "Income_in_1000"
names(CreditCard_new)[names(CreditCard_new)=="expenditure"] <- "Expenditure_in_1000"
# print (CreditCard_new)
#Analyze a subset of data with a certain column value greater than a specified value (in this example, the Income_in_1000 > 50)
CreditCard_new1 = subset(CreditCard_new, Income_in_1000>50)
summary(CreditCard_new1)
## Card# Reports Age_in_Years Income_in_1000
## Min. : 82.0 Length:84 Min. :0.000 Min. :50.33
## 1st Qu.: 418.5 Class :character 1st Qu.:0.000 1st Qu.:52.94
## Median : 734.5 Mode :character Median :0.000 Median :55.12
## Mean : 713.8 Mean :0.369 Mean :57.34
## 3rd Qu.:1038.5 3rd Qu.:0.000 3rd Qu.:60.44
## Max. :1299.0 Max. :6.000 Max. :83.50
## share Expenditure_in_1000 owner NA
## Min. : 1.450 Min. :0.0001091 Min. : 0.00 Length:84
## 1st Qu.: 2.500 1st Qu.:0.0008000 1st Qu.: 0.00 Class :character
## Median : 3.700 Median :0.0227894 Median : 72.47 Mode :character
## Mean : 4.232 Mean :0.0439334 Mean : 175.47
## 3rd Qu.: 5.318 3rd Qu.:0.0625904 3rd Qu.: 213.09
## Max. :12.500 Max. :0.2975549 Max. :3099.51
## NA.1 NA.2 NA.3 NA.4
## Length:84 Min. :0.000 Min. : 1.00 Min. :0.00
## Class :character 1st Qu.:0.000 1st Qu.: 38.75 1st Qu.:0.75
## Mode :character Median :1.000 Median :120.00 Median :1.00
## Mean :0.881 Mean :149.10 Mean :0.75
## 3rd Qu.:1.000 3rd Qu.:240.00 3rd Qu.:1.00
## Max. :4.000 Max. :540.00 Max. :1.00
## NA.5
## Min. : 0.00
## 1st Qu.: 3.00
## Median : 6.50
## Mean : 8.06
## 3rd Qu.:13.00
## Max. :44.00
#Note: the Income_in_1000 Min value is 50.33 so the subset is correct.
#Q3: Install ggplot package
installed.packages("ggplot2")
## Package LibPath Version Priority Depends Imports LinkingTo Suggests
## Enhances License License_is_FOSS License_restricts_use OS_type Archs
## MD5sum NeedsCompilation Built
library(ggplot2)
# #Q3: Scatter Plot
ggplot(CreditCard_new1, aes(x=Income_in_1000, y=Expenditure_in_1000)) + geom_point(color="red", size = 1, alpha=1) + scale_x_continuous("Income_in_1000$") + scale_y_continuous("Expenditure_in_1000$") + theme_minimal()

# Analysis: We see that the higher plots of expenditure is towards the lower income range.
#Q3: Boxplot / of report in the x axis and Expenditure in Y axis
crop=ggplot(data=CreditCard_new1, mapping=aes(x=Reports, y=Expenditure_in_1000))+geom_boxplot()
crop

#Q3: Adding Mean value to the Box Plot (purple point)
ggplot(CreditCard_new1, aes(x=Income_in_1000, y=Expenditure_in_1000, fill = Income_in_1000)) +
geom_boxplot() +
stat_summary(fun = "mean", geom = "point", shape = 10,
size = 1, color = "Purple")
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

# The purple points shows the mean value of expenditure for each range of income. We can see that the means are higher towards the left where the income is lower.
#A3 - Histogram of data - Expenditure_in_1000
ggplot(data = CreditCard_new1, aes(x = Income_in_1000)) + geom_histogram(bins = 10)

#A3 - Histogram of data - Expenditure_in_1000
ggplot(data = CreditCard_new1, aes(x = Expenditure_in_1000)) + geom_histogram(bins = 10)

#Q4: Linera regression of expenditure and Income
ggplot(data=CreditCard_new1, aes(x=Expenditure_in_1000, y=Income_in_1000, group=1)) +
geom_line()+
geom_smooth(method='lm')
## `geom_smooth()` using formula 'y ~ x'
