library(readxl)
## Warning: package 'readxl' was built under R version 4.3.3
# Read the churn workbook into a tibble.
df <- read_excel(path = "churn.xlsx")

# Preview the first six rows to sanity-check the import.
head(df, n = 6L)
## # A tibble: 6 × 14
##   RowNumber CustomerId Surname CreditScore Geography Gender   Age Tenure Balance
##       <dbl>      <dbl> <chr>         <dbl> <chr>     <chr>  <dbl>  <dbl> <chr>  
## 1         1   15634602 Hargra…         619 France    Female    NA      2 0      
## 2         2   15647311 Hill            608 <NA>      Female    41      1 83807.…
## 3         3   15619304 Onio            502 France    Female    42      8 159660…
## 4         4   15701354 Boni            699 France    Female    39      1 0      
## 5         5   15737888 Mitche…         850 <NA>      Female    43      2 125510…
## 6         6   15574012 Chu             645 Spain     Male      44      8 113755…
## # ℹ 5 more variables: NumOfProducts <dbl>, HasCrCard <dbl>,
## #   IsActiveMember <dbl>, EstimatedSalary <chr>, Exited <dbl>
# Count the missing values in each column.
vapply(df, function(column) sum(is.na(column)), numeric(1))
##       RowNumber      CustomerId         Surname     CreditScore       Geography 
##               0               0               0               0               3 
##          Gender             Age          Tenure         Balance   NumOfProducts 
##               3               3               0               2               0 
##       HasCrCard  IsActiveMember EstimatedSalary          Exited 
##               0               0               0               0
# Show each column's type and its first few values.
str(df)
## tibble [10,000 × 14] (S3: tbl_df/tbl/data.frame)
##  $ RowNumber      : num [1:10000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ CustomerId     : num [1:10000] 15634602 15647311 15619304 15701354 15737888 ...
##  $ Surname        : chr [1:10000] "Hargrave" "Hill" "Onio" "Boni" ...
##  $ CreditScore    : num [1:10000] 619 608 502 699 850 645 822 376 501 684 ...
##  $ Geography      : chr [1:10000] "France" NA "France" "France" ...
##  $ Gender         : chr [1:10000] "Female" "Female" "Female" "Female" ...
##  $ Age            : num [1:10000] NA 41 42 39 43 44 NA 29 44 27 ...
##  $ Tenure         : num [1:10000] 2 1 8 1 2 8 7 4 4 2 ...
##  $ Balance        : chr [1:10000] "0" "83807.86" "159660.8" "0" ...
##  $ NumOfProducts  : num [1:10000] 1 1 3 2 1 2 2 4 2 1 ...
##  $ HasCrCard      : num [1:10000] 1 0 1 0 1 1 1 1 0 1 ...
##  $ IsActiveMember : num [1:10000] 1 1 0 0 1 0 1 0 1 1 ...
##  $ EstimatedSalary: chr [1:10000] "101348.88" "112542.58" "113931.57" "93826.63" ...
##  $ Exited         : num [1:10000] 1 0 1 0 0 1 0 1 0 0 ...
# Balance is read in as text; coerce it to numeric before imputing.
df$Balance <- as.numeric(df$Balance)

# Numeric gaps: fill with the mean of the observed values.
for (num_col in c("Balance", "Age")) {
  observed_mean <- mean(df[[num_col]], na.rm = TRUE)
  df[[num_col]][is.na(df[[num_col]])] <- observed_mean
}

# Categorical gaps: fill with the most frequent observed value.
for (cat_col in c("Gender", "Geography")) {
  level_counts <- table(df[[cat_col]])
  df[[cat_col]][is.na(df[[cat_col]])] <- names(sort(-level_counts)[1])
}
# Re-count the missing values in each column.
vapply(df, function(column) sum(is.na(column)), numeric(1))
##       RowNumber      CustomerId         Surname     CreditScore       Geography 
##               0               0               0               0               0 
##          Gender             Age          Tenure         Balance   NumOfProducts 
##               0               0               0               0               0 
##       HasCrCard  IsActiveMember EstimatedSalary          Exited 
##               0               0               0               0
# NOTE(review): the line below is disabled. If re-enabled it would run gsub()
# over every column, removing all punctuation (decimal points and minus signs
# included) and leaving every column as character - confirm whether it can be
# deleted.
#df <- as.data.frame(lapply(df, function(x) gsub("[[:punct:]]", "",x)))
# Re-inspect each column's type and first few values.
str(df)
## tibble [10,000 × 14] (S3: tbl_df/tbl/data.frame)
##  $ RowNumber      : num [1:10000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ CustomerId     : num [1:10000] 15634602 15647311 15619304 15701354 15737888 ...
##  $ Surname        : chr [1:10000] "Hargrave" "Hill" "Onio" "Boni" ...
##  $ CreditScore    : num [1:10000] 619 608 502 699 850 645 822 376 501 684 ...
##  $ Geography      : chr [1:10000] "France" "France" "France" "France" ...
##  $ Gender         : chr [1:10000] "Female" "Female" "Female" "Female" ...
##  $ Age            : num [1:10000] 38.9 41 42 39 43 ...
##  $ Tenure         : num [1:10000] 2 1 8 1 2 8 7 4 4 2 ...
##  $ Balance        : num [1:10000] 0 83808 159661 0 125511 ...
##  $ NumOfProducts  : num [1:10000] 1 1 3 2 1 2 2 4 2 1 ...
##  $ HasCrCard      : num [1:10000] 1 0 1 0 1 1 1 1 0 1 ...
##  $ IsActiveMember : num [1:10000] 1 1 0 0 1 0 1 0 1 1 ...
##  $ EstimatedSalary: chr [1:10000] "101348.88" "112542.58" "113931.57" "93826.63" ...
##  $ Exited         : num [1:10000] 1 0 1 0 0 1 0 1 0 0 ...
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Age distribution: histogram of counts with a kernel density overlay.
# A density integrates to 1, so it must be multiplied by n * bin width to sit
# on the same scale as histogram counts. The bin width is therefore fixed
# explicitly instead of being left to plotly's automatic binning.
age_bin_width <- 2
dens <- density(df$Age)

plot_ly(
  data = df,
  x = ~Age,
  type = "histogram",
  xbins = list(size = age_bin_width),
  name = "Count"
) %>%
  add_lines(
    # Plain vectors, not formulas: the density grid is not a column of df.
    x = dens$x,
    y = dens$y * length(df$Age) * age_bin_width,
    name = "Density (scaled to counts)",
    # Do not inherit histogram-only attributes such as xbins.
    inherit = FALSE
  )
# Bar trace of Age against Gender built from every row of df; rows whose
# Gender is "Female" are coloured red and all other rows grey.
gender_bar_colors <- ifelse(df$Gender == "Female", "red", "grey")

df %>%
  plot_ly(
    x = ~Gender,
    y = ~Age,
    type = "bar",
    marker = list(color = gender_bar_colors)
  )
# Per-gender summary: mean Age and the sum of the Exited column.
df_group <- df %>%
  group_by(Gender) %>%
  summarise(
    Age = mean(Age),
    Exited = sum(Exited)
  )

# One colour per summary row: "Female" in red, everything else in grey.
gender_colors <- ifelse(df_group$Gender == "Female", "red", "grey")

# Bars for mean Age and for the Exited total of each gender.
# `text` is mapped to the Gender column (the bare string "Gender" would attach
# that literal word to every bar), and each trace is named so the legend
# distinguishes the two measures.
plot_ly(
  data = df_group,
  x = ~Gender,
  y = ~Age,
  type = "bar",
  name = "Mean age",
  text = ~Gender,
  marker = list(color = gender_colors)
) %>%
  add_trace(
    x = ~Gender,
    y = ~Exited,
    name = "Exited (sum)",
    marker = list(color = gender_colors)
  )
# Line chart of the mean number of products at each age.
# A "lines" trace joins points in row order, so the raw rows (not ordered by
# Age, and repeating each age many times) are first reduced to one mean per
# age; aggregate() returns the groups sorted by Age.
products_by_age <- aggregate(NumOfProducts ~ Age, data = df, FUN = mean)

plot_ly(
  data = products_by_age,
  x = ~Age,
  y = ~NumOfProducts,
  type = "scatter",
  mode = "lines"
) %>%
  layout(yaxis = list(title = "Mean NumOfProducts"))
# Scatter plot of Age against EstimatedSalary.
# EstimatedSalary is stored as character, which would put it on a categorical
# axis; convert it so the x axis is numeric. Values that cannot be parsed
# become NA (with a coercion warning).
plot_ly(
  data = df,
  y = ~Age,
  x = ~as.numeric(EstimatedSalary),
  type = "scatter",
  mode = "markers"
) %>%
  # Keep the column name as the axis label instead of the conversion call.
  layout(xaxis = list(title = "EstimatedSalary"))
# Box plot of Age with the mean marked (boxmean); the y axis runs from 0 to
# 10 above the largest Age in the data.
age_axis_upper <- max(df$Age) + 10

df %>%
  plot_ly(y = ~Age, type = "box", boxmean = TRUE) %>%
  layout(yaxis = list(range = c(0, age_axis_upper)))
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
# Read the salaries data set.
dh <- read.csv("ds_salaries.csv")

# Mean of the salary column for each work year.
df_new <- dh %>%
  group_by(work_year) %>%
  summarise(
    mean_salary = mean(salary)
  )

# Scatter of mean salary by year. The scatter `mode` flags are "lines",
# "markers" and "text"; the singular "marker" is not one of them, so the
# markers-only request was not being honoured.
plot_ly(
  data = df_new,
  x = ~work_year,
  y = ~mean_salary,
  type = "scatter",
  mode = "markers"
) %>%
  layout(
    title = "Linear",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Mean Salary", range = c(0, 600000))
  )
# Show each mpg column's type and its first few values.
str(mpg)
## tibble [234 × 11] (S3: tbl_df/tbl/data.frame)
##  $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
##  $ model       : chr [1:234] "a4" "a4" "a4" "a4" ...
##  $ displ       : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr [1:234] "f" "f" "f" "f" ...
##  $ cty         : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr [1:234] "p" "p" "p" "p" ...
##  $ class       : chr [1:234] "compact" "compact" "compact" "compact" ...
# Stacked bar chart: number of rows per class, filled by the drv column.
ggplot(data = mpg, mapping = aes(x = class, fill = drv)) +
  geom_bar()

# Take the first 100 rows of mpg for inspection.
head(mpg, n = 100)
## # A tibble: 100 × 11
##    manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto… f        18    29 p     comp…
##  2 audi         a4           1.8  1999     4 manu… f        21    29 p     comp…
##  3 audi         a4           2    2008     4 manu… f        20    31 p     comp…
##  4 audi         a4           2    2008     4 auto… f        21    30 p     comp…
##  5 audi         a4           2.8  1999     6 auto… f        16    26 p     comp…
##  6 audi         a4           2.8  1999     6 manu… f        18    26 p     comp…
##  7 audi         a4           3.1  2008     6 auto… f        18    27 p     comp…
##  8 audi         a4 quattro   1.8  1999     4 manu… 4        18    26 p     comp…
##  9 audi         a4 quattro   1.8  1999     4 auto… 4        16    25 p     comp…
## 10 audi         a4 quattro   2    2008     4 manu… 4        20    28 p     comp…
## # ℹ 90 more rows
library(ggplot2)
# Load the mpg data set into the workspace.
data("mpg")
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Row count for every class/drv combination, ordered by class then drv.
mpgg <- mpg %>%
  count(class, drv) %>%
  arrange(class, drv)

# Stacked bars: one bar per class, coloured by drv; the hover label shows
# class, drv and the count.
mpgg %>%
  plot_ly(
    x = ~class,
    y = ~n,
    color = ~drv,
    type = "bar",
    hoverinfo = "text",
    text = ~paste(class, drv, n)
  ) %>%
  layout(barmode = "stack")
# Correlation matrix of Age, NumOfProducts and CreditScore.
corr_columns <- c("Age", "NumOfProducts", "CreditScore")
cor(df[, corr_columns])
##                        Age NumOfProducts  CreditScore
## Age            1.000000000   -0.03062867 -0.004032286
## NumOfProducts -0.030628669    1.00000000  0.012237879
## CreditScore   -0.004032286    0.01223788  1.000000000
# Use the mtcars data set.
data(mtcars)

# Build an interactive plotly scatter plot of hp against mpg.
# The trace type must be "scatter": "choropleth" is a map trace with no x/y
# attributes, so plotly warned that neither variable was valid for it.
plot <- plot_ly(
  data = mtcars,
  x = ~mpg,
  y = ~hp,
  type = "scatter",
  mode = "markers",
  marker = list(size = 10, opacity = 0.8)
)

# Display the plot.
plot