Summary Statistics

library(skimr)
library(visdat)
library(tidyverse)
library(plotly)
library(knitr)
library(glancedata)
library(faraway)
# Load the teengamb data set (made available by the packages attached above).
data("teengamb")
# Render the first rows as a table with capitalised column headers.
# `col.names` is an argument of kable(), not head(): when it was nested inside
# the head() call it was silently ignored and the original lowercase names
# were printed instead.
kable(
  head(teengamb),
  col.names = c("Sex", "Status", "Income", "Verbal", "Gamble")
)

sex	status	income	verbal	gamble
1	51	2.00	8	0.0
1	28	2.50	8	0.0
1	37	2.00	6	0.0
1	28	7.00	4	7.3
1	65	2.00	8	19.6
1	61	3.47	6	0.1

# Per-column summary of teengamb: completeness, mean, sd, quantiles and an
# inline histogram for each numeric variable.
skimr::skim(teengamb)

Data summary
Name	teengamb
Number of rows	47
Number of columns	5
_______________________
Column type frequency:
numeric	5
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
sex	1	0.40	0.50	0.0	0.0	0.00	1.00	1	▇▁▁▁▆
status	1	45.23	17.26	18.0	28.0	43.00	61.50	75	▇▅▇▃▆
income	1	4.64	3.55	0.6	2.0	3.25	6.21	15	▇▃▂▂▁
verbal	1	6.66	1.86	1.0	6.0	7.00	8.00	10	▁▂▆▇▃
gamble	1	19.30	31.52	0.0	1.1	6.00	19.40	156	▇▁▁▁▁

# Column-by-column overview of teengamb from the glancedata package
# (attached at the top of the file).
glance_data(teengamb)

# Workbook variant of the same overview.
# NOTE(review): presumably this also writes a spreadsheet file as a side
# effect -- confirm against the glancedata documentation.
glance_data_in_workbook(teengamb)

## $all
## # A tibble: 5 x 11
##   name  type  distinct_values minimum median maximum   mean     sd na_proportion
##   <chr> <chr>           <int>   <dbl>  <dbl>   <dbl>  <dbl>  <dbl>         <dbl>
## 1 sex   nume…               2     0     0          1  0.404  0.496             0
## 2 stat… nume…              16    18    43         75 45.2   17.3               0
## 3 inco… nume…              26     0.6   3.25      15  4.64   3.55              0
## 4 verb… nume…               9     1     7         10  6.66   1.86              0
## 5 gamb… nume…              36     0     6        156 19.3   31.5               0
## # … with 2 more variables: count <chr>, sample_values <chr>
## 
## $summary
## # A tibble: 2 x 2
##   cat           n
##   <chr>     <int>
## 1 binary        1
## 2 numerical     4
## 
## $all_nas
## # A tibble: 0 x 6
## # … with 6 variables: name <chr>, type <chr>, distinct_values <int>,
## #   na_proportion <dbl>, count <chr>, sample_values <chr>
## 
## $single_value
## # A tibble: 0 x 11
## # … with 11 variables: name <chr>, type <chr>, distinct_values <int>,
## #   minimum <dbl>, median <dbl>, maximum <dbl>, mean <dbl>, sd <dbl>,
## #   na_proportion <dbl>, count <chr>, sample_values <chr>
## 
## $binary
## # A tibble: 1 x 11
##   name  type  distinct_values minimum median maximum  mean    sd na_proportion
##   <chr> <chr>           <int>   <dbl>  <dbl>   <dbl> <dbl> <dbl>         <dbl>
## 1 sex   nume…               2       0      0       1 0.404 0.496             0
## # … with 2 more variables: count <chr>, sample_values <chr>
## 
## $numerical
## # A tibble: 4 x 10
##   name  type  distinct_values minimum median maximum  mean    sd na_proportion
##   <chr> <chr>           <int>   <dbl>  <dbl>   <dbl> <dbl> <dbl>         <dbl>
## 1 stat… nume…              16    18    43         75 45.2  17.3              0
## 2 inco… nume…              26     0.6   3.25      15  4.64  3.55             0
## 3 verb… nume…               9     1     7         10  6.66  1.86             0
## 4 gamb… nume…              36     0     6        156 19.3  31.5              0
## # … with 1 more variable: sample_values <chr>
## 
## $categorical
## # A tibble: 0 x 5
## # … with 5 variables: name <chr>, distinct_values <int>, na_proportion <dbl>,
## #   count <chr>, sample_values <chr>

# Plot the variables that glancedata treats as discrete; the printed grob
# layout below shows panels for `sex` and `verbal`.
plot_discrete_vars(teengamb)

## TableGrob (2 x 2) "arrange": 3 grobs
##        z     cells    name               grob
## sex    1 (1-1,1-1) arrange     gtable[layout]
## verbal 2 (1-1,2-2) arrange     gtable[layout]
##        3 (2-2,1-2) arrange text[GRID.text.89]

# Draw the numerical variables of teengamb once per supported plot type.
# The calls stay as separate top-level expressions so each result is
# auto-printed exactly as before.

# Histograms
plot_numerical_vars(teengamb, plot_type = "histogram")

# Density curves
plot_numerical_vars(teengamb, plot_type = "density")

# Box plots
plot_numerical_vars(teengamb, plot_type = "boxplot")

# Quantile-quantile plots
plot_numerical_vars(teengamb, plot_type = "qqplot")

# Pairwise plots
plot_numerical_vars(teengamb, plot_type = "pairwise")

# Work on a copy so the original teengamb data stays untouched.
teen <- teengamb

# Replace selected values with NA so the missingness plot below has something
# to show. Each line must be a single indexed assignment: the previous form
# `teen$x = teen$x[cond] <- NA` parsed as `teen$x = (teen$x[cond] <- NA)`,
# which overwrote the ENTIRE column with NA instead of only the matching rows.
# NOTE(review): the summary above reports a maximum income of 15, so
# `income > 70` flags no rows -- confirm the intended cutoff.
teen$income[teen$income > 70] <- NA
teen$gamble[teen$gamble > 30] <- NA
teen$verbal[teen$verbal < 3] <- NA

# Visualise the missing values in `teen`: columns sorted by missingness, rows
# clustered, and percentages shown. Uses the literal TRUE rather than `T`,
# which is an ordinary variable that can be reassigned.
visdat::vis_miss(
  teen,
  sort_miss = TRUE,
  show_perc = TRUE,
  cluster = TRUE
)

# Recode the numeric 0/1 `sex` column as a labelled factor. as.factor() orders
# the levels "0", "1", so 0 becomes "Male" and 1 becomes "Female".
# NOTE(review): assumes 0 = male and 1 = female in the source data -- confirm
# against the data set documentation.
teengamb$sex <- as.factor(teengamb$sex)
levels(teengamb$sex) <- c("Male", "Female")

# Scatter plot of income against status, coloured by sex, then converted to an
# interactive plotly widget.
p <- ggplot(data = teengamb, mapping = aes(status, income, colour = sex)) +
  geom_point()
ggplotly(p)