library(skimr)
library(visdat)
library(tidyverse)
library(plotly)
library(knitr)
library(glancedata)
library(faraway)
# Load the teengamb data set (made available by library(faraway) above).
data("teengamb")
# Preview the first rows with capitalised column headers.
# `col.names` is an argument of kable(), not head(): head() silently ignores
# unknown arguments, so the labels must be passed outside the head() call.
kable(
  head(teengamb),
  col.names = c("Sex", "Status", "Income", "Verbal", "Gamble")
)
sex status income verbal gamble
1 51 2.00 8 0.0
1 28 2.50 8 0.0
1 37 2.00 6 0.0
1 28 7.00 4 7.3
1 65 2.00 8 19.6
1 61 3.47 6 0.1
# Per-column summary of teengamb: missing counts, moments, quantiles and an
# inline histogram for each variable.
skimr::skim(teengamb)
Data summary
Name teengamb
Number of rows 47
Number of columns 5
_______________________
Column type frequency:
numeric 5
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
sex 0 1 0.40 0.50 0.0 0.0 0.00 1.00 1 ▇▁▁▁▆
status 0 1 45.23 17.26 18.0 28.0 43.00 61.50 75 ▇▅▇▃▆
income 0 1 4.64 3.55 0.6 2.0 3.25 6.21 15 ▇▃▂▂▁
verbal 0 1 6.66 1.86 1.0 6.0 7.00 8.00 10 ▁▂▆▇▃
gamble 0 1 19.30 31.52 0.0 1.1 6.00 19.40 156 ▇▁▁▁▁
# Column-level overview of teengamb from glancedata; the printed result groups
# the columns into lists such as $all, $summary, $binary and $numerical.
glancedata::glance_data(teengamb)
# NOTE(review): presumably produces the same overview as a spreadsheet
# workbook -- confirm what is written and where against the glancedata docs.
glancedata::glance_data_in_workbook(teengamb)
## $all
## # A tibble: 5 x 11
##   name  type  distinct_values minimum median maximum   mean     sd na_proportion
##   <chr> <chr>           <int>   <dbl>  <dbl>   <dbl>  <dbl>  <dbl>         <dbl>
## 1 sex   nume…               2     0     0          1  0.404  0.496             0
## 2 stat… nume…              16    18    43         75 45.2   17.3               0
## 3 inco… nume…              26     0.6   3.25      15  4.64   3.55              0
## 4 verb… nume…               9     1     7         10  6.66   1.86              0
## 5 gamb… nume…              36     0     6        156 19.3   31.5               0
## # … with 2 more variables: count <chr>, sample_values <chr>
## 
## $summary
## # A tibble: 2 x 2
##   cat           n
##   <chr>     <int>
## 1 binary        1
## 2 numerical     4
## 
## $all_nas
## # A tibble: 0 x 6
## # … with 6 variables: name <chr>, type <chr>, distinct_values <int>,
## #   na_proportion <dbl>, count <chr>, sample_values <chr>
## 
## $single_value
## # A tibble: 0 x 11
## # … with 11 variables: name <chr>, type <chr>, distinct_values <int>,
## #   minimum <dbl>, median <dbl>, maximum <dbl>, mean <dbl>, sd <dbl>,
## #   na_proportion <dbl>, count <chr>, sample_values <chr>
## 
## $binary
## # A tibble: 1 x 11
##   name  type  distinct_values minimum median maximum  mean    sd na_proportion
##   <chr> <chr>           <int>   <dbl>  <dbl>   <dbl> <dbl> <dbl>         <dbl>
## 1 sex   nume…               2       0      0       1 0.404 0.496             0
## # … with 2 more variables: count <chr>, sample_values <chr>
## 
## $numerical
## # A tibble: 4 x 10
##   name  type  distinct_values minimum median maximum  mean    sd na_proportion
##   <chr> <chr>           <int>   <dbl>  <dbl>   <dbl> <dbl> <dbl>         <dbl>
## 1 stat… nume…              16    18    43         75 45.2  17.3              0
## 2 inco… nume…              26     0.6   3.25      15  4.64  3.55             0
## 3 verb… nume…               9     1     7         10  6.66  1.86             0
## 4 gamb… nume…              36     0     6        156 19.3  31.5              0
## # … with 1 more variable: sample_values <chr>
## 
## $categorical
## # A tibble: 0 x 5
## # … with 5 variables: name <chr>, distinct_values <int>, na_proportion <dbl>,
## #   count <chr>, sample_values <chr>
# Plot the variables glancedata treats as discrete; the TableGrob printed
# after this call lists one panel each for `sex` and `verbal`.
glancedata::plot_discrete_vars(teengamb)
## TableGrob (2 x 2) "arrange": 3 grobs
##        z     cells    name               grob
## sex    1 (1-1,1-1) arrange     gtable[layout]
## verbal 2 (1-1,2-2) arrange     gtable[layout]
##        3 (2-2,1-2) arrange text[GRID.text.89]
# Draw the numerical variables of teengamb five ways, one call per
# `plot_type`: histogram, density, boxplot, QQ plot and pairwise plot.
# The calls stay as separate top-level statements so each result is
# rendered on its own.
glancedata::plot_numerical_vars(teengamb,plot_type = "histogram")
glancedata::plot_numerical_vars(teengamb,plot_type = "density")
glancedata::plot_numerical_vars(teengamb, plot_type = "boxplot")
glancedata::plot_numerical_vars(teengamb, plot_type = "qqplot")
glancedata::plot_numerical_vars(teengamb, plot_type = "pairwise")

# Work on a copy so the original teengamb data stays intact.
teen <- teengamb

# Blank out selected values to create missingness for the vis_miss() demo.
# Each statement replaces only the entries matching its condition. The earlier
# chained form (`teen$x = teen$x[cond] <- NA`) parsed as
# `teen$x = (teen$x[cond] <- NA)`, which assigned the scalar NA to the whole
# column and wiped every value.
# NOTE(review): the skim summary reports a maximum income of 15, so `> 70`
# matches no rows -- confirm the intended income threshold.
teen$income[teen$income > 70] <- NA
teen$gamble[teen$gamble > 30] <- NA
teen$verbal[teen$verbal < 3] <- NA

# Missingness map: columns sorted by amount missing, percentages shown,
# rows clustered by missingness pattern.
visdat::vis_miss(teen, sort_miss = TRUE, show_perc = TRUE, cluster = TRUE)

# Recode `sex` as a labelled factor: the first level of the converted column
# is relabelled "Male" and the second "Female".
teengamb$sex <- factor(
  as.factor(teengamb$sex),
  labels = c("Male", "Female")
)

# Scatter plot of income against status, coloured by sex, then handed to
# plotly to make it interactive.
p <- ggplot(
  data = teengamb,
  mapping = aes(x = status, y = income, color = sex)
) +
  geom_point()
ggplotly(p)