This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
setwd("G:\\GSSC\\Analytics\\AO GSSC\\Jessica\\DATAcademy\\ProbSet")
getwd()
## [1] "G:/GSSC/Analytics/AO GSSC/Jessica/DATAcademy/ProbSet"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(psych)
## Warning: package 'psych' was built under R version 3.3.3
library(MASS)
## Warning: package 'MASS' was built under R version 3.3.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(FSA)
## Warning: package 'FSA' was built under R version 3.3.3
## ## FSA v0.8.13. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.
##
## Attaching package: 'FSA'
## The following object is masked from 'package:psych':
##
## headtail
data<-read.csv("WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv", header=TRUE,stringsAsFactors = TRUE)
dim(data)
## [1] 9134 24
null<-sapply(data,function(y) sum(is.na(y)))
null<-data.frame(null)
null
## null
## Customer 0
## State 0
## Customer.Lifetime.Value 0
## Response 0
## Coverage 0
## Education 0
## Effective.To.Date 0
## EmploymentStatus 0
## Gender 0
## Income 0
## Location.Code 0
## Marital.Status 0
## Monthly.Premium.Auto 0
## Months.Since.Last.Claim 0
## Months.Since.Policy.Inception 0
## Number.of.Open.Complaints 0
## Number.of.Policies 0
## Policy.Type 0
## Policy 0
## Renew.Offer.Type 0
## Sales.Channel 0
## Total.Claim.Amount 0
## Vehicle.Class 0
## Vehicle.Size 0
dups<-data[duplicated(data),]
dups
## [1] Customer State
## [3] Customer.Lifetime.Value Response
## [5] Coverage Education
## [7] Effective.To.Date EmploymentStatus
## [9] Gender Income
## [11] Location.Code Marital.Status
## [13] Monthly.Premium.Auto Months.Since.Last.Claim
## [15] Months.Since.Policy.Inception Number.of.Open.Complaints
## [17] Number.of.Policies Policy.Type
## [19] Policy Renew.Offer.Type
## [21] Sales.Channel Total.Claim.Amount
## [23] Vehicle.Class Vehicle.Size
## <0 rows> (or 0-length row.names)
str(data)
## 'data.frame': 9134 obs. of 24 variables:
## $ Customer : Factor w/ 9134 levels "AA10041","AA11235",..: 601 5947 97 8017 2489 4948 8434 756 1352 548 ...
## $ State : Factor w/ 5 levels "Arizona","California",..: 5 1 3 2 5 4 4 1 4 4 ...
## $ Customer.Lifetime.Value : num 2764 6980 12887 7646 2814 ...
## $ Response : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 2 1 2 1 ...
## $ Coverage : Factor w/ 3 levels "Basic","Extended",..: 1 2 3 1 1 1 1 3 1 2 ...
## $ Education : Factor w/ 5 levels "Bachelor","College",..: 1 1 1 1 1 1 2 5 1 2 ...
## $ Effective.To.Date : Factor w/ 59 levels "1/1/11","1/10/11",..: 48 25 42 13 53 18 48 10 19 40 ...
## $ EmploymentStatus : Factor w/ 5 levels "Disabled","Employed",..: 2 5 2 5 2 2 2 5 3 2 ...
## $ Gender : Factor w/ 2 levels "F","M": 1 1 1 2 2 1 1 2 2 1 ...
## $ Income : int 56274 0 48767 0 43836 62902 55350 0 14072 28812 ...
## $ Location.Code : Factor w/ 3 levels "Rural","Suburban",..: 2 2 2 2 1 1 2 3 2 3 ...
## $ Marital.Status : Factor w/ 3 levels "Divorced","Married",..: 2 3 2 2 3 2 2 3 1 2 ...
## $ Monthly.Premium.Auto : int 69 94 108 106 73 69 67 101 71 93 ...
## $ Months.Since.Last.Claim : int 32 13 18 18 12 14 0 0 13 17 ...
## $ Months.Since.Policy.Inception: int 5 42 38 65 44 94 13 68 3 7 ...
## $ Number.of.Open.Complaints : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Number.of.Policies : int 1 8 2 7 1 2 9 4 2 8 ...
## $ Policy.Type : Factor w/ 3 levels "Corporate Auto",..: 1 2 2 1 2 2 1 1 1 3 ...
## $ Policy : Factor w/ 9 levels "Corporate L1",..: 3 6 6 2 4 6 3 3 3 8 ...
## $ Renew.Offer.Type : Factor w/ 4 levels "Offer1","Offer2",..: 1 3 1 1 1 2 1 1 1 2 ...
## $ Sales.Channel : Factor w/ 4 levels "Agent","Branch",..: 1 1 1 3 1 4 1 1 1 2 ...
## $ Total.Claim.Amount : num 385 1131 566 530 138 ...
## $ Vehicle.Class : Factor w/ 6 levels "Four-Door Car",..: 6 1 6 5 1 6 1 1 1 1 ...
## $ Vehicle.Size : Factor w/ 3 levels "Large","Medsize",..: 2 2 2 2 2 2 2 2 2 2 ...
data$Customer<-as.character(data$Customer)
data$Education<-factor(data$Education,levels = c("High School or Below",
"College","Bachelor","Master"
,"Doctor"),ordered = TRUE)
data$Effective.To.Date<-as.Date(data$Effective.To.Date,"%m/%d/%y")
STATS_CLV<-data.frame(t(describe(data$Customer.Lifetime.Value)))
STATS_CLV
## X1
## vars 1.000000
## n 9134.000000
## mean 8004.940475
## sd 6870.967608
## median 5780.182197
## trimmed 6631.050152
## mad 3658.899063
## min 1898.007675
## max 83325.381190
## range 81427.373515
## skew 3.031284
## kurtosis 13.811629
## se 71.893131
hist(data$Customer.Lifetime.Value,col="blue")
STATS_TCA<-data.frame(t(describe(data$Total.Claim.Amount)))
STATS_TCA
## X1
## vars 1.000000
## n 9134.000000
## mean 434.088794
## sd 290.500092
## median 383.945434
## trimmed 402.676723
## mad 213.575300
## min 0.099007
## max 2893.239678
## range 2893.140671
## skew 1.714403
## kurtosis 5.973506
## se 3.039595
hist(data$Total.Claim.Amount,col="blue")
kruskal.test(Customer.Lifetime.Value ~ Location.Code, data=data)
##
## Kruskal-Wallis rank sum test
##
## data: Customer.Lifetime.Value by Location.Code
## Kruskal-Wallis chi-squared = 2.4638, df = 2, p-value = 0.2917
kruskal.test(Monthly.Premium.Auto ~ Location.Code, data=data)
##
## Kruskal-Wallis rank sum test
##
## data: Monthly.Premium.Auto by Location.Code
## Kruskal-Wallis chi-squared = 35.493, df = 2, p-value = 1.962e-08
kruskal.test(Total.Claim.Amount ~ Location.Code, data=data)
##
## Kruskal-Wallis rank sum test
##
## data: Total.Claim.Amount by Location.Code
## Kruskal-Wallis chi-squared = 5249.9, df = 2, p-value < 2.2e-16
dunnTest(Monthly.Premium.Auto~Location.Code,data=data,method="bh")
## Dunn (1964) Kruskal-Wallis multiple comparison
## p-values adjusted with the Benjamini-Hochberg method.
## Comparison Z P.unadj P.adj
## 1 Rural - Suburban -4.765825959 1.880816e-06 5.642447e-06
## 2 Rural - Urban -0.004732278 9.962242e-01 9.962242e-01
## 3 Suburban - Urban 4.554072363 5.261724e-06 7.892586e-06
dunnTest(Total.Claim.Amount~Location.Code,data=data,method="bh")
## Dunn (1964) Kruskal-Wallis multiple comparison
## p-values adjusted with the Benjamini-Hochberg method.
## Comparison Z P.unadj P.adj
## 1 Rural - Suburban -69.93754 0.000000e+00 0.000000e+00
## 2 Rural - Urban -26.87015 4.904913e-159 4.904913e-159
## 3 Suburban - Urban 34.16409 8.260262e-256 1.239039e-255
kruskal.test(Customer.Lifetime.Value ~ Education, data=data)
##
## Kruskal-Wallis rank sum test
##
## data: Customer.Lifetime.Value by Education
## Kruskal-Wallis chi-squared = 12.234, df = 4, p-value = 0.01569
kruskal.test(Monthly.Premium.Auto ~ Education, data=data)
##
## Kruskal-Wallis rank sum test
##
## data: Monthly.Premium.Auto by Education
## Kruskal-Wallis chi-squared = 11.766, df = 4, p-value = 0.01918
kruskal.test(Total.Claim.Amount ~ Education, data=data)
##
## Kruskal-Wallis rank sum test
##
## data: Total.Claim.Amount by Education
## Kruskal-Wallis chi-squared = 234.87, df = 4, p-value < 2.2e-16
dunnTest(Customer.Lifetime.Value~Education,data=data,method="bh")
## Warning in if (tmp$Eclass != "factor") {: the condition has length > 1 and
## only the first element will be used
## Warning in if (tmp$Eclass == "numeric") STOP("RHS variable must be a
## factor."): the condition has length > 1 and only the first element will be
## used
## Warning: Education was coerced to a factor.
## Dunn (1964) Kruskal-Wallis multiple comparison
## p-values adjusted with the Benjamini-Hochberg method.
## Comparison Z P.unadj P.adj
## 1 Bachelor - College -0.3423319 0.73210115 0.81344573
## 2 Bachelor - Doctor 0.9165031 0.35940307 0.44925384
## 3 College - Doctor 1.0770808 0.28144415 0.40206307
## 4 Bachelor - High School or Below -2.7540966 0.00588544 0.05885440
## 5 College - High School or Below -2.3991050 0.01643520 0.08217600
## 6 Doctor - High School or Below -2.2218471 0.02629364 0.08764547
## 7 Bachelor - Master -1.6008027 0.10942062 0.21884125
## 8 College - Master -1.3726642 0.16985677 0.28309461
## 9 Doctor - Master -1.8175241 0.06913691 0.17284227
## 10 High School or Below - Master 0.2144934 0.83016235 0.83016235
dunnTest(Monthly.Premium.Auto~Education,data=data,method="bh")
## Warning in if (tmp$Eclass != "factor") {: the condition has length > 1 and
## only the first element will be used
## Warning in if (tmp$Eclass == "numeric") STOP("RHS variable must be a
## factor."): the condition has length > 1 and only the first element will be
## used
## Warning: Education was coerced to a factor.
## Dunn (1964) Kruskal-Wallis multiple comparison
## p-values adjusted with the Benjamini-Hochberg method.
## Comparison Z P.unadj P.adj
## 1 Bachelor - College -1.2509845 0.210940148 0.35156691
## 2 Bachelor - Doctor 0.4469886 0.654883313 0.65488331
## 3 College - Doctor 1.0377936 0.299366139 0.42766591
## 4 Bachelor - High School or Below -3.1559036 0.001600017 0.01600017
## 5 College - High School or Below -1.9004236 0.057377550 0.19125850
## 6 Doctor - High School or Below -1.9443726 0.051850534 0.25925267
## 7 Bachelor - Master -1.3962841 0.162628961 0.40657240
## 8 College - Master -0.5743687 0.565718348 0.62857594
## 9 Doctor - Master -1.2761959 0.201886290 0.40377258
## 10 High School or Below - Master 0.6816348 0.495469878 0.61933735
dunnTest(Total.Claim.Amount~Education,data=data,method="bh")
## Warning in if (tmp$Eclass != "factor") {: the condition has length > 1 and
## only the first element will be used
## Warning in if (tmp$Eclass == "numeric") STOP("RHS variable must be a
## factor."): the condition has length > 1 and only the first element will be
## used
## Warning: Education was coerced to a factor.
## Dunn (1964) Kruskal-Wallis multiple comparison
## p-values adjusted with the Benjamini-Hochberg method.
## Comparison Z P.unadj P.adj
## 1 Bachelor - College 0.27118828 7.862462e-01 8.736069e-01
## 2 Bachelor - Doctor 5.17099833 2.328467e-07 3.326381e-07
## 3 College - Doctor 5.03565335 4.762217e-07 5.952771e-07
## 4 Bachelor - High School or Below -8.61052713 7.272538e-18 1.818135e-17
## 5 College - High School or Below -8.82649625 1.080060e-18 3.600199e-18
## 6 Doctor - High School or Below -9.24597533 2.331049e-20 1.165525e-19
## 7 Bachelor - Master 7.30854174 2.700571e-13 5.401141e-13
## 8 College - Master 7.11184867 1.144988e-12 1.908313e-12
## 9 Doctor - Master 0.09211263 9.266086e-01 9.266086e-01
## 10 High School or Below - Master 12.92160404 3.399791e-38 3.399791e-37
kruskal.test(Customer.Lifetime.Value ~ Marital.Status, data=data)
##
## Kruskal-Wallis rank sum test
##
## data: Customer.Lifetime.Value by Marital.Status
## Kruskal-Wallis chi-squared = 20.896, df = 2, p-value = 2.901e-05
kruskal.test(Monthly.Premium.Auto ~ Marital.Status, data=data)
##
## Kruskal-Wallis rank sum test
##
## data: Monthly.Premium.Auto by Marital.Status
## Kruskal-Wallis chi-squared = 0.84235, df = 2, p-value = 0.6563
kruskal.test(Total.Claim.Amount ~ Marital.Status, data=data)
##
## Kruskal-Wallis rank sum test
##
## data: Total.Claim.Amount by Marital.Status
## Kruskal-Wallis chi-squared = 570.33, df = 2, p-value < 2.2e-16
dunnTest(Customer.Lifetime.Value~Marital.Status,data=data,method="bh")
## Dunn (1964) Kruskal-Wallis multiple comparison
## p-values adjusted with the Benjamini-Hochberg method.
## Comparison Z P.unadj P.adj
## 1 Divorced - Married -0.3153141 7.525232e-01 7.525232e-01
## 2 Divorced - Single 2.9633445 3.043158e-03 4.564736e-03
## 3 Married - Single 4.4895799 7.136379e-06 2.140914e-05
dunnTest(Monthly.Premium.Auto~Marital.Status,data=data,method="bh")
## Dunn (1964) Kruskal-Wallis multiple comparison
## p-values adjusted with the Benjamini-Hochberg method.
## Comparison Z P.unadj P.adj
## 1 Divorced - Married -0.8755608 0.3812688 1.0000000
## 2 Divorced - Single -0.8183699 0.4131460 0.6197190
## 3 Married - Single -0.0424569 0.9661345 0.9661345
dunnTest(Total.Claim.Amount~Marital.Status,data=data,method="bh")
## Dunn (1964) Kruskal-Wallis multiple comparison
## p-values adjusted with the Benjamini-Hochberg method.
## Comparison Z P.unadj P.adj
## 1 Divorced - Married 2.706153 6.806771e-03 6.806771e-03
## 2 Divorced - Single -14.656871 1.217562e-48 1.826343e-48
## 3 Married - Single -23.631936 1.810485e-123 5.431456e-123
kruskal.test(Customer.Lifetime.Value ~ State, data=data)
##
## Kruskal-Wallis rank sum test
##
## data: Customer.Lifetime.Value by State
## Kruskal-Wallis chi-squared = 5.0721, df = 4, p-value = 0.28
kruskal.test(Monthly.Premium.Auto ~ State, data=data)
##
## Kruskal-Wallis rank sum test
##
## data: Monthly.Premium.Auto by State
## Kruskal-Wallis chi-squared = 10.384, df = 4, p-value = 0.03444
kruskal.test(Total.Claim.Amount ~ State, data=data)
##
## Kruskal-Wallis rank sum test
##
## data: Total.Claim.Amount by State
## Kruskal-Wallis chi-squared = 0.95493, df = 4, p-value = 0.9166
dunnTest(Monthly.Premium.Auto~State,data=data,method="bh")
## Dunn (1964) Kruskal-Wallis multiple comparison
## p-values adjusted with the Benjamini-Hochberg method.
## Comparison Z P.unadj P.adj
## 1 Arizona - California -1.8486482 0.064508625 0.16127156
## 2 Arizona - Nevada -2.3866345 0.017003386 0.08501693
## 3 California - Nevada -1.1394116 0.254531511 0.31816439
## 4 Arizona - Oregon -1.3994994 0.161663267 0.23094752
## 5 California - Oregon 0.4521124 0.651188005 0.65118801
## 6 Nevada - Oregon 1.4213960 0.155201671 0.25866945
## 7 Arizona - Washington 0.6770269 0.498388875 0.55376542
## 8 California - Washington 2.1358855 0.032688746 0.10896249
## 9 Nevada - Washington 2.6210194 0.008766728 0.08766728
## 10 Oregon - Washington 1.7957332 0.072536961 0.14507392
kruskal.test(Customer.Lifetime.Value ~ Sales.Channel, data=data)
##
## Kruskal-Wallis rank sum test
##
## data: Customer.Lifetime.Value by Sales.Channel
## Kruskal-Wallis chi-squared = 4.4918, df = 3, p-value = 0.213
kruskal.test(Monthly.Premium.Auto ~ Sales.Channel, data=data)
##
## Kruskal-Wallis rank sum test
##
## data: Monthly.Premium.Auto by Sales.Channel
## Kruskal-Wallis chi-squared = 2.2758, df = 3, p-value = 0.5172
kruskal.test(Total.Claim.Amount ~ Sales.Channel, data=data)
##
## Kruskal-Wallis rank sum test
##
## data: Total.Claim.Amount by Sales.Channel
## Kruskal-Wallis chi-squared = 0.77009, df = 3, p-value = 0.8566
policy_cal<-mutate(summarize(group_by(filter(data, State=="California"),Policy.Type), Num_Pol=sum(Number.of.Policies)),
Ratio=Num_Pol/sum(Num_Pol))
policy_cal
## # A tibble: 3 × 3
## Policy.Type Num_Pol Ratio
## <fctr> <int> <dbl>
## 1 Corporate Auto 2174 0.23256312
## 2 Personal Auto 6838 0.73149337
## 3 Special Auto 336 0.03594352
policy_all<-mutate(summarize(group_by(data,Policy.Type), Num_Pol=sum(Number.of.Policies)),
Ratio=Num_Pol/sum(Num_Pol))
policy_all
## # A tibble: 3 × 3
## Policy.Type Num_Pol Ratio
## <fctr> <int> <dbl>
## 1 Corporate Auto 5873 0.21677186
## 2 Personal Auto 20132 0.74307017
## 3 Special Auto 1088 0.04015797
top_cal<-head(arrange(filter(data,State=="California"),desc(Customer.Lifetime.Value)),n=10)
top_cal<-subset(top_cal,select=c(Customer,Customer.Lifetime.Value,Education,Marital.Status,Location.Code))
ch_cal<-summarise(group_by(filter(data,State=="California"),Sales.Channel),AVE_CLV=mean(Customer.Lifetime.Value),
AVE_MPA=mean(Monthly.Premium.Auto),AVE_TCA=mean(Total.Claim.Amount),
TOT_NOP=ceiling(sum(Number.of.Policies)))
ch_cal
## # A tibble: 4 × 5
## Sales.Channel AVE_CLV AVE_MPA AVE_TCA TOT_NOP
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 Agent 7878.525 94.05495 443.6984 3523
## 2 Branch 8204.506 94.01556 446.8240 2675
## 3 Call Center 8076.694 91.84959 423.8503 1816
## 4 Web 7835.420 93.16892 422.6979 1334
ch_all<-summarise(group_by(data,Sales.Channel),AVE_CLV=mean(Customer.Lifetime.Value),
AVE_MPA=mean(Monthly.Premium.Auto),AVE_TCA=mean(Total.Claim.Amount),
TOT_NOP=ceiling(sum(Number.of.Policies)))
ch_all
## # A tibble: 4 × 5
## Sales.Channel AVE_CLV AVE_MPA AVE_TCA TOT_NOP
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 Agent 7957.709 93.67903 438.4347 10247
## 2 Branch 8119.712 93.30697 432.8668 7519
## 3 Call Center 8100.086 92.48215 428.1246 5306
## 4 Web 7779.788 92.82491 432.9967 4021
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.