# Read the Insurance.csv file from GitHub and store it in a data frame.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() no longer
# converts strings to factors by default, and the recorded summary below (per-level
# counts for Group and Age) is only produced when those columns are factors.
insurance <- read.csv(
  "https://raw.githubusercontent.com/miachen410/InsuranceClaims/master/Insurance.csv",
  stringsAsFactors = TRUE
)
# 1. Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes.
summary(insurance)
##        X            District       Group       Age        Holders       
##  Min.   : 1.00   Min.   :1.00   <1l   :16   <25  :16   Min.   :   3.00  
##  1st Qu.:16.75   1st Qu.:1.75   >2l   :16   >35  :16   1st Qu.:  46.75  
##  Median :32.50   Median :2.50   1-1.5l:16   25-29:16   Median : 136.00  
##  Mean   :32.50   Mean   :2.50   1.5-2l:16   30-35:16   Mean   : 364.98  
##  3rd Qu.:48.25   3rd Qu.:3.25                          3rd Qu.: 327.50  
##  Max.   :64.00   Max.   :4.00                          Max.   :3582.00  
##      Claims      
##  Min.   :  0.00  
##  1st Qu.:  9.50  
##  Median : 22.00  
##  Mean   : 49.23  
##  3rd Qu.: 55.50  
##  Max.   :400.00
writeLines("") # emit an empty line before the statistics
# Mean number of policyholders across all rows, rounded to two decimals
avgHolders <- round(mean(insurance[["Holders"]]), digits = 2)
paste0("Average number of policyholders is ", avgHolders)
## [1] "Average number of policyholders is 364.98"
# Median number of policyholders across all rows
medHolders <- median(insurance[["Holders"]])
paste0("Median number of policyholders is ", medHolders)
## [1] "Median number of policyholders is 136"
# Mean number of insurance claims across all rows, rounded to two decimals
avgClaims <- round(mean(insurance[["Claims"]]), digits = 2)
paste0("Average number of insurance claims is ", avgClaims)
## [1] "Average number of insurance claims is 49.23"
# Median number of insurance claims across all rows
medClaims <- median(insurance[["Claims"]])
paste0("Median number of insurance claims is ", medClaims)
## [1] "Median number of insurance claims is 22"
# 2. Create a new data frame with a subset of the columns and rows. Make sure to rename it.
# Keep the rows whose Age is "<25" and whose Holders count is 30 or more.
# which() drops rows where the condition is NA, matching what subset() does.
young <- insurance[
  with(insurance, which(Age == "<25" & Holders >= 30)),
  ,
  drop = FALSE
]
young
##     X District  Group Age Holders Claims
## 1   1        1    <1l <25     197     38
## 5   5        1 1-1.5l <25     284     63
## 9   9        1 1.5-2l <25     133     19
## 17 17        2    <1l <25      85     22
## 21 21        2 1-1.5l <25     149     25
## 25 25        2 1.5-2l <25      66     14
## 33 33        3    <1l <25      35      5
## 37 37        3 1-1.5l <25      53     10
## 53 53        4 1-1.5l <25      31      7
# 3. Create new column names for the new data frame.
# The first column becomes "Y"; every other name is built by prefixing "Y".
young <- setNames(
  young,
  c("Y", paste0("Y", c("District", "Group", "Age", "Holders", "Claims")))
)
young
##     Y YDistrict YGroup YAge YHolders YClaims
## 1   1         1    <1l  <25      197      38
## 5   5         1 1-1.5l  <25      284      63
## 9   9         1 1.5-2l  <25      133      19
## 17 17         2    <1l  <25       85      22
## 21 21         2 1-1.5l  <25      149      25
## 25 25         2 1.5-2l  <25       66      14
## 33 33         3    <1l  <25       35       5
## 37 37         3 1-1.5l  <25       53      10
## 53 53         4 1-1.5l  <25       31       7
# 4. Use the summary function to create an overview of your new data frame. Then print the mean and median for the same two attributes. Please compare.
# Per-column overview of the renamed subset
summary(object = young)
##        Y           YDistrict        YGroup     YAge      YHolders    
##  Min.   : 1.00   Min.   :1.000   <1l   :3   <25  :9   Min.   : 31.0  
##  1st Qu.: 9.00   1st Qu.:1.000   >2l   :0   >35  :0   1st Qu.: 53.0  
##  Median :21.00   Median :2.000   1-1.5l:4   25-29:0   Median : 85.0  
##  Mean   :22.33   Mean   :2.111   1.5-2l:2   30-35:0   Mean   :114.8  
##  3rd Qu.:33.00   3rd Qu.:3.000                        3rd Qu.:149.0  
##  Max.   :53.00   Max.   :4.000                        Max.   :284.0  
##     YClaims     
##  Min.   : 5.00  
##  1st Qu.:10.00  
##  Median :19.00  
##  Mean   :22.56  
##  3rd Qu.:25.00  
##  Max.   :63.00
cat("\n") # print a blank line
# Each message reports a statistic for the young subset, says whether it is below
# the matching overall statistic, and ends with that overall value.
# All four comparisons use strict <, so a tie is reported as "more than or equal to".
# Print and compare average number of young policyholders with the overall average
avgYholders <- round(mean(young$YHolders), 2)
paste(
  "Average number of young policyholders is", avgYholders,
  ifelse(avgYholders < avgHolders, ", less than average", ", more than or equal to average"),
  avgHolders
)
## [1] "Average number of young policyholders is 114.78 , less than average 364.98"
# Print and compare median number of young policyholders with the overall median
medYholders <- median(young$YHolders)
paste(
  "Median number of young policyholders is", medYholders,
  ifelse(medYholders < medHolders, ", less than median", ", more than or equal to median"),
  medHolders
)
## [1] "Median number of young policyholders is 85 , less than median 136"
# Print and compare average claims from young policyholders with the overall average claims
avgYclaims <- round(mean(young$YClaims), 2)
paste(
  "Average insurance claims from young group is", avgYclaims,
  ifelse(avgYclaims < avgClaims, ", less than average", ", more than or equal to average"),
  avgClaims
)
## [1] "Average insurance claims from young group is 22.56 , less than average 49.23"
# Print and compare median claims from young policyholders with the overall median claims
medYclaims <- median(young$YClaims)
paste(
  "Median insurance claims from young group is", medYclaims,
  ifelse(medYclaims < medClaims, ", less than median", ", more than or equal to median"),
  medClaims
)
## [1] "Median insurance claims from young group is 19 , less than median 22"
# 5. For at least 3 values in a column please rename so that every value in that column is renamed.
# Swap each lowercase "l" for an uppercase "L" in every value of the YGroup column
young[["YGroup"]] <- gsub(
  pattern = "l",
  replacement = "L",
  x = young[["YGroup"]]
)
young
##     Y YDistrict YGroup YAge YHolders YClaims
## 1   1         1    <1L  <25      197      38
## 5   5         1 1-1.5L  <25      284      63
## 9   9         1 1.5-2L  <25      133      19
## 17 17         2    <1L  <25       85      22
## 21 21         2 1-1.5L  <25      149      25
## 25 25         2 1.5-2L  <25       66      14
## 33 33         3    <1L  <25       35       5
## 37 37         3 1-1.5L  <25       53      10
## 53 53         4 1-1.5L  <25       31       7