# Read the Insurance.csv file from GitHub and store it in a data frame.
# stringsAsFactors = TRUE is spelled out because read.csv() stopped converting
# strings to factors by default in R 4.0.0, and the summaries recorded below
# (per-level counts for Group and Age) rely on those columns being factors.
insurance <- read.csv(
  "https://raw.githubusercontent.com/miachen410/InsuranceClaims/master/Insurance.csv",
  stringsAsFactors = TRUE
)
# 1. Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes.
# Per-column overview of the full data set.
summary(object = insurance)
## X District Group Age Holders
## Min. : 1.00 Min. :1.00 <1l :16 <25 :16 Min. : 3.00
## 1st Qu.:16.75 1st Qu.:1.75 >2l :16 >35 :16 1st Qu.: 46.75
## Median :32.50 Median :2.50 1-1.5l:16 25-29:16 Median : 136.00
## Mean :32.50 Mean :2.50 1.5-2l:16 30-35:16 Mean : 364.98
## 3rd Qu.:48.25 3rd Qu.:3.25 3rd Qu.: 327.50
## Max. :64.00 Max. :4.00 Max. :3582.00
## Claims
## Min. : 0.00
## 1st Qu.: 9.50
## Median : 22.00
## Mean : 49.23
## 3rd Qu.: 55.50
## Max. :400.00
writeLines("") # emit a blank line
# Mean number of policyholders over all rows, rounded to two decimals
avgHolders <- round(mean(insurance[["Holders"]]), digits = 2)
paste("Average number of policyholders is", avgHolders, sep = " ")
## [1] "Average number of policyholders is 364.98"
# Median number of policyholders over all rows
medHolders <- median(insurance[["Holders"]])
paste("Median number of policyholders is", medHolders, sep = " ")
## [1] "Median number of policyholders is 136"
# Mean number of insurance claims over all rows, rounded to two decimals
avgClaims <- round(mean(insurance[["Claims"]]), digits = 2)
paste("Average number of insurance claims is", avgClaims, sep = " ")
## [1] "Average number of insurance claims is 49.23"
# Median number of insurance claims over all rows
medClaims <- median(insurance[["Claims"]])
paste("Median number of insurance claims is", medClaims, sep = " ")
## [1] "Median number of insurance claims is 22"
# 2. Create a new data frame with a subset of the columns and rows. Make sure to rename it.
# Keep only the under-25 rows whose policyholder count is 30 or more.
# which() drops rows where the comparison is NA, just as subset() would.
young <- insurance[
  with(insurance, which(Age == "<25" & Holders >= 30)), ,
  drop = FALSE
]
young
## X District Group Age Holders Claims
## 1 1 1 <1l <25 197 38
## 5 5 1 1-1.5l <25 284 63
## 9 9 1 1.5-2l <25 133 19
## 17 17 2 <1l <25 85 22
## 21 21 2 1-1.5l <25 149 25
## 25 25 2 1.5-2l <25 66 14
## 33 33 3 <1l <25 35 5
## 37 37 3 1-1.5l <25 53 10
## 53 53 4 1-1.5l <25 31 7
# 3. Create new column names for the new data frame.
# The first column is renamed to "Y"; the remaining five get a "Y" prefix.
colnames(young) <- c(
  "Y",
  paste0("Y", c("District", "Group", "Age", "Holders", "Claims"))
)
young
## Y YDistrict YGroup YAge YHolders YClaims
## 1 1 1 <1l <25 197 38
## 5 5 1 1-1.5l <25 284 63
## 9 9 1 1.5-2l <25 133 19
## 17 17 2 <1l <25 85 22
## 21 21 2 1-1.5l <25 149 25
## 25 25 2 1.5-2l <25 66 14
## 33 33 3 <1l <25 35 5
## 37 37 3 1-1.5l <25 53 10
## 53 53 4 1-1.5l <25 31 7
# 4. Use the summary function to create an overview of your new data frame. Then print the mean and median for the same two attributes. Please compare.
# Per-column overview of the under-25 subset.
summary(object = young)
## Y YDistrict YGroup YAge YHolders
## Min. : 1.00 Min. :1.000 <1l :3 <25 :9 Min. : 31.0
## 1st Qu.: 9.00 1st Qu.:1.000 >2l :0 >35 :0 1st Qu.: 53.0
## Median :21.00 Median :2.000 1-1.5l:4 25-29:0 Median : 85.0
## Mean :22.33 Mean :2.111 1.5-2l:2 30-35:0 Mean :114.8
## 3rd Qu.:33.00 3rd Qu.:3.000 3rd Qu.:149.0
## Max. :53.00 Max. :4.000 Max. :284.0
## YClaims
## Min. : 5.00
## 1st Qu.:10.00
## Median :19.00
## Mean :22.56
## 3rd Qu.:25.00
## Max. :63.00
writeLines("") # emit a blank line
# Mean policyholder count in the young subset (two decimals), reported
# alongside the overall mean with a less-than / at-least verdict.
avgYholders <- round(mean(young[["YHolders"]]), digits = 2)
paste(
  "Average number of young policyholders is",
  avgYholders,
  ifelse(
    test = avgYholders < avgHolders,
    yes = ", less than average",
    no = ", more than or equal to average"
  ),
  avgHolders
)
## [1] "Average number of young policyholders is 114.78 , less than average 364.98"
# Print and compare median number of young policyholders with the overall median.
# The comparison is strict ("<") so that a tie lands in the "more than or equal
# to median" branch, which is what that message says and what the other
# comparisons in this script do.
medYholders <- median(young$YHolders)
paste(
  "Median number of young policyholders is",
  medYholders,
  ifelse(
    medYholders < medHolders,
    ", less than median",
    ", more than or equal to median"
  ),
  medHolders
)
## [1] "Median number of young policyholders is 85 , less than median 136"
# Mean claim count in the young subset (two decimals), reported alongside the
# overall mean with a less-than / at-least verdict.
avgYclaims <- round(mean(young[["YClaims"]]), digits = 2)
paste(
  "Average insurance claims from young group is",
  avgYclaims,
  ifelse(
    test = avgYclaims < avgClaims,
    yes = ", less than average",
    no = ", more than or equal to average"
  ),
  avgClaims
)
## [1] "Average insurance claims from young group is 22.56 , less than average 49.23"
# Print and compare median claims from young policyholders with the overall median claims.
# The verdict text says "median" (not "average") because both values being
# compared are medians.
medYclaims <- median(young$YClaims)
paste(
  "Median insurance claims from young group is",
  medYclaims,
  ifelse(
    medYclaims < medClaims,
    ", less than median",
    ", more than or equal to median"
  ),
  medClaims
)
## [1] "Median insurance claims from young group is 19 , less than median 22"
# 5. For at least 3 values in a column please rename so that every value in that column is renamed.
# Swap every lowercase "l" for an uppercase "L" in the "YGroup" column.
young[["YGroup"]] <- gsub(
  pattern = "l",
  replacement = "L",
  x = young[["YGroup"]]
)
young
## Y YDistrict YGroup YAge YHolders YClaims
## 1 1 1 <1L <25 197 38
## 5 5 1 1-1.5L <25 284 63
## 9 9 1 1.5-2L <25 133 19
## 17 17 2 <1L <25 85 22
## 21 21 2 1-1.5L <25 149 25
## 25 25 2 1.5-2L <25 66 14
## 33 33 3 <1L <25 35 5
## 37 37 3 1-1.5L <25 53 10
## 53 53 4 1-1.5L <25 31 7