Read Data

# Load the startup dataset from its CSV file, then print the last rows as a
# quick check that the whole file was read.
# NOTE(review): the path is absolute and user-specific; it will only resolve
# on the original author's machine.
data_up <- read.csv(file = "C:/Users/mhdha/Downloads/startup_data.csv")
tail(x = data_up)
##     Startup.Name   Industry Funding.Rounds Funding.Amount..M.USD.
## 495  Startup_495 E-Commerce              4                 114.12
## 496  Startup_496     EdTech              2                 181.86
## 497  Startup_497         AI              2                 107.34
## 498  Startup_498 E-Commerce              1                 160.29
## 499  Startup_499     Gaming              5                 234.65
## 500  Startup_500 HealthTech              4                 211.76
##     Valuation..M.USD. Revenue..M.USD. Employees Market.Share.... Profitable
## 495           1503.71           79.19      4014             1.13          1
## 496           2378.65           59.64      3331             0.58          1
## 497           1394.58           10.22      2223             5.85          0
## 498            502.09           84.73      2222             4.32          0
## 499           2814.52           53.16      4972             5.53          0
## 500           2563.17           84.19      2374             5.26          0
##     Year.Founded        Region Exit.Status
## 495         2015        Europe         IPO
## 496         1993        Europe     Private
## 497         2019 South America     Private
## 498         2019     Australia     Private
## 499         2011        Europe     Private
## 500         2000 North America     Private
# Per-column summary statistics of the loaded data frame.
summary(object = data_up)
##  Startup.Name         Industry         Funding.Rounds  Funding.Amount..M.USD.
##  Length:500         Length:500         Min.   :1.000   Min.   :  0.57        
##  Class :character   Class :character   1st Qu.:2.000   1st Qu.: 79.21        
##  Mode  :character   Mode  :character   Median :3.000   Median :156.00        
##                                        Mean   :2.958   Mean   :152.66        
##                                        3rd Qu.:4.000   3rd Qu.:226.45        
##                                        Max.   :5.000   Max.   :299.81        
##  Valuation..M.USD. Revenue..M.USD.   Employees    Market.Share....
##  Min.   :   2.43   Min.   : 0.12   Min.   :  12   Min.   : 0.100  
##  1st Qu.: 557.03   1st Qu.:22.80   1st Qu.:1383   1st Qu.: 2.760  
##  Median :1222.58   Median :48.80   Median :2496   Median : 5.135  
##  Mean   :1371.81   Mean   :49.32   Mean   :2532   Mean   : 5.093  
##  3rd Qu.:2052.09   3rd Qu.:74.97   3rd Qu.:3709   3rd Qu.: 7.553  
##  Max.   :4357.49   Max.   :99.71   Max.   :4984   Max.   :10.000  
##    Profitable     Year.Founded     Region          Exit.Status       
##  Min.   :0.000   Min.   :1990   Length:500         Length:500        
##  1st Qu.:0.000   1st Qu.:1998   Class :character   Class :character  
##  Median :0.000   Median :2006   Mode  :character   Mode  :character  
##  Mean   :0.432   Mean   :2006                                        
##  3rd Qu.:1.000   3rd Qu.:2014                                        
##  Max.   :1.000   Max.   :2022
# List the column names of the data frame.
names(data_up)
##  [1] "Startup.Name"           "Industry"               "Funding.Rounds"        
##  [4] "Funding.Amount..M.USD." "Valuation..M.USD."      "Revenue..M.USD."       
##  [7] "Employees"              "Market.Share...."       "Profitable"            
## [10] "Year.Founded"           "Region"                 "Exit.Status"

Eksplorasi Data

Histogram

# Number of observations (n), taken from the data itself so it cannot drift
# out of sync with the file that was read.
n <- nrow(data_up)

# Square-root rule: the suggested number of histogram classes is sqrt(n).
k1 <- sqrt(n)
k1
## [1] 22.36068
# Histograms of the five numeric variables. The number of classes comes from
# the square-root rule value k1 (rounded up) instead of a repeated literal.
# hist() treats a single number for `breaks` as a suggestion only.
hist(data_up$Funding.Amount..M.USD., breaks = ceiling(k1), col = "red",
     main = "Sebaran Data Total Pendanaan (Funding Amount)")

hist(data_up$Valuation..M.USD., breaks = ceiling(k1), col = "coral",
     main = "Sebaran Data Penilaian")

hist(data_up$Revenue..M.USD., breaks = ceiling(k1), col = "orange",
     main = "Sebaran Data Pendapatan")

hist(data_up$Employees, breaks = ceiling(k1), col = "yellow",
     main = "Sebaran Data Jumlah Pekerja")

hist(data_up$Market.Share...., breaks = ceiling(k1), col = "green",
     main = "Sebaran Data Market Share")

Dari histogram yang ditunjukkan, hampir semua peubah cenderung menyebar secara seragam, kecuali Penilaian (Valuation), yang histogramnya menjulur ke kanan.

Boxplot

# Boxplots of the five numeric variables, used to spot outliers visually.
# `breaks` is a hist() argument and has no meaning for boxplot(), so it is
# not passed here.
boxplot(data_up$Funding.Amount..M.USD., col = "red")

boxplot(data_up$Valuation..M.USD., col = "coral")

boxplot(data_up$Revenue..M.USD., col = "orange")

boxplot(data_up$Employees, col = "yellow")

boxplot(data_up$Market.Share...., col = "green")

Dari boxplot di atas, hampir semua peubah tidak memiliki outlier, kecuali data Penilaian (Valuation) yang memiliki outlier.

Uji Formal Anderson-Darling

library(nortest)

# Anderson-Darling normality test on the valuation column.
ad.test(x = data_up$Valuation..M.USD.)
## 
##  Anderson-Darling normality test
## 
## data:  data_up$Valuation..M.USD.
## A = 6.0269, p-value = 7.771e-15

QQ-Plot

Mengecek bentuk sebaran

# QQ-plot of the valuations against a normal distribution whose mean and
# standard deviation are estimated from the same data. Everything runs inside
# local() so the helper variables do not leak into the global workspace.
local({
  valuation <- data_up$Valuation..M.USD.
  mu <- mean(valuation)
  sigma <- sd(valuation)

  # Theoretical normal quantiles at the plotting positions of the sample.
  theoretical <- qnorm(ppoints(length(valuation)), mean = mu, sd = sigma)

  # Explicit axis labels replace the default labels, which would otherwise be
  # the deparsed argument expressions.
  qqplot(x = theoretical, y = valuation,
         main = "QQ-Plot Valuation",
         xlab = "Theoretical Quantiles",
         ylab = "Sample Quantiles")

  # Reference line against the same fitted normal distribution.
  qqline(valuation, col = "red",
         distribution = function(p) qnorm(p, mean = mu, sd = sigma))
})

Goodness-of-Fit Test

Chi-Squared Test

# Chi-squared goodness-of-fit test of a numeric vector against a normal
# distribution whose mean and standard deviation are estimated from the data.
#
# Args:
#   Valuation..M.USD.: numeric vector to test.
#
# Returns:
#   The result of chisq.test() comparing the observed bin counts with the
#   bin probabilities implied by the fitted normal distribution.
chi_square_test <- function(Valuation..M.USD.) {
  # Work on the argument that was passed in rather than on a global object.
  x <- Valuation..M.USD.

  # Bin the data once; the same breaks define observed and expected counts.
  binned <- hist(x, breaks = 10, plot = FALSE)

  # Observed frequency per bin.
  observed <- binned$counts

  # Expected frequency per bin under the fitted normal distribution.
  expected <- diff(pnorm(binned$breaks, mean = mean(x), sd = sd(x))) * length(x)

  # rescale.p = TRUE turns the expected counts into probabilities summing to 1
  # (the normal tails outside the outermost breaks are not covered by a bin).
  chisq.test(x = observed, p = expected, rescale.p = TRUE)
}

# Pass the valuation column explicitly; the function now evaluates its input.
chi_square_test(data_up$Valuation..M.USD.)
## Warning in chisq.test(x = observed, p = expected, rescale.p = TRUE):
## Chi-squared approximation may be incorrect
## 
##  Chi-squared test for given probabilities
## 
## data:  observed
## X-squared = 99.643, df = 8, p-value < 2.2e-16

Mendeteksi Pencilan

# Return the values of a numeric vector that fall outside the 1.5 * IQR
# fences (below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR).
#
# Args:
#   Valuation..M.USD.: numeric vector to inspect.
#
# Returns:
#   The elements of the input lying outside the fences (possibly length 0).
detect_outlier <- function(Valuation..M.USD.) {
  # Use the supplied vector instead of reading a global data frame.
  values <- Valuation..M.USD.

  quartiles <- quantile(values, c(0.25, 0.75), names = FALSE)
  whisker <- 1.5 * diff(quartiles)
  lower_fence <- quartiles[1] - whisker
  upper_fence <- quartiles[2] + whisker

  values[values < lower_fence | values > upper_fence]
}

# Show the valuation values flagged by the 1.5 * IQR rule.
print(detect_outlier(data_up[["Valuation..M.USD."]]))
## [1] 4357.49

Hanya data Penilaian yang memiliki outlier; data lainnya tidak memiliki outlier.

# Return the positions of the elements of a numeric vector that fall outside
# the 1.5 * IQR fences (below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR).
#
# Args:
#   Valuation..M.USD.: numeric vector to inspect.
#
# Returns:
#   Integer vector of indices into the input (possibly length 0).
detect_outlier_index <- function(Valuation..M.USD.) {
  # Use the supplied vector instead of reading a global data frame.
  values <- Valuation..M.USD.

  quartiles <- quantile(values, c(0.25, 0.75), names = FALSE)
  whisker <- 1.5 * diff(quartiles)
  lower_fence <- quartiles[1] - whisker
  upper_fence <- quartiles[2] + whisker

  which(values < lower_fence | values > upper_fence)
}

# Show the row positions of the valuation values flagged by the IQR rule.
print(detect_outlier_index(data_up[["Valuation..M.USD."]]))
## [1] 385

Pengujian Formal

Rosner’s Test

library(EnvStats)
## Warning: package 'EnvStats' was built under R version 4.4.3
## 
## Attaching package: 'EnvStats'
## The following objects are masked from 'package:stats':
## 
##     predict, predict.lm
# Rosner's test for up to 10 outliers in the valuation column; only the
# per-candidate statistics table is displayed.
x <- data_up$Valuation..M.USD.
rosnerTest(x, k = 10)[["all.stats"]]
##    i   Mean.i     SD.i   Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1  0 1371.809 978.2266 4357.49     385 3.052136   3.863127   FALSE
## 2  1 1365.826 970.0066 4264.90     226 2.988716   3.862597   FALSE
## 3  2 1360.004 962.2168 4137.15     365 2.886195   3.862066   FALSE
## 4  3 1354.417 955.0639 4125.56     109 2.901527   3.861533   FALSE
## 5  4 1348.830 947.8634 4110.36     398 2.913427   3.861000   FALSE
## 6  5 1343.251 940.6355 4110.09      52 2.941457   3.860465   FALSE
## 7  6 1337.650 933.2900 4066.70     399 2.924118   3.859929   FALSE
## 8  7 1332.114 926.0844 4016.20      95 2.898317   3.859392   FALSE
## 9  8 1326.659 919.0628 3774.33     253 2.663225   3.858854   FALSE
## 10 9 1321.674 913.3174 3721.99     361 2.628129   3.858314   FALSE

Statistik Kekar untuk Ukuran Pemusatan

Median

# Median of a numeric vector, ignoring missing values.
#
# Args:
#   Valuation..M.USD.: numeric vector.
#
# Returns:
#   The median of the non-missing elements.
hitung_median <- function(Valuation..M.USD.) {
  # Operate on the argument itself; na.rm = TRUE drops NA values.
  median(Valuation..M.USD., na.rm = TRUE)
}

# Median of the valuation column.
hitung_median(data_up[["Valuation..M.USD."]])
## [1] 1222.58

Trimmed Mean

# Print the ordinary mean and the trimmed mean of a numeric vector.
#
# Args:
#   Valuation..M.USD.: numeric vector; missing values are ignored.
#   trim: fraction of observations trimmed from each end before averaging
#     (passed to mean()).
#
# Returns:
#   NULL, invisibly (the value of the final cat() call); the results are
#   written to the console.
hitung_mean <- function(Valuation..M.USD., trim = 0.1) {
  # Compute both means from the supplied vector, not from a global object.
  plain_mean <- mean(Valuation..M.USD., na.rm = TRUE)
  trimmed_mean <- mean(Valuation..M.USD., trim = trim, na.rm = TRUE)

  cat("Rataan Biasa       : ", plain_mean, "\n")
  cat("Rataan Terpangkas : ", trimmed_mean, "\n\n")
}

# Ordinary and 10%-trimmed mean of the valuation column.
hitung_mean(data_up[["Valuation..M.USD."]])
## Rataan Biasa       :  1371.809 
## Rataan Terpangkas :  1286.024

Winsorized Mean

library(datawizard)
## Warning: package 'datawizard' was built under R version 4.4.3
## 
## Attaching package: 'datawizard'
## The following objects are masked from 'package:EnvStats':
## 
##     kurtosis, skewness
# Print the mean of a vector after percentile winsorization.
#
# Args:
#   data_vector: vector handed to winsorize().
#   threshold: value forwarded to winsorize() as its `threshold` argument.
#
# Returns:
#   NULL, invisibly (the value of cat()); the result is written to the console.
winsor_data <- function(data_vector, threshold = 0.1) {
  winsorized <- winsorize(
    data_vector,
    threshold = threshold,
    method = "percentile",
    verbose = FALSE
  )
  winsorized_mean <- mean(winsorized, na.rm = TRUE)

  cat("Winsorized Mean : ", winsorized_mean, "\n\n")
}

# Winsorized mean of the valuation column with the default threshold.
winsor_data(data_vector = data_up$Valuation..M.USD.)
## Winsorized Mean :  1317.304

Midhinge

# Keep the valuation column in its own vector for the statistics below.
dataval <- data_up$Valuation..M.USD.
# Midhinge of the valuations: the average of the first and third quartiles.
unname(quantile(dataval, 0.25) + quantile(dataval, 0.75)) / 2
## [1] 1304.556

Statistik Kekar untuk Ukuran Penyebaran

Jarak Antar Kuartil

# Interquartile range (Q3 - Q1) of the valuations.
IQR(x = dataval)
## [1] 1495.057

Median Absolute Deviation (MAD)

# Median absolute deviation of the valuations (default scaling constant).
mad(x = dataval)
## [1] 1080.252

Trimmed Variance

library(chemometrics)
## Warning: package 'chemometrics' was built under R version 4.4.3
## Loading required package: rpart
# sd() and sd_trim() both return standard deviations, so the labels say
# "Simpangan baku" (standard deviation) rather than "Ragam" (variance).
# Square the values if a variance is required.
cat("Simpangan baku tak terpangkas :", sd(dataval), "\n")
## Simpangan baku tak terpangkas : 978.2266
cat("Simpangan baku terpangkas :", sd_trim(dataval, trim = 0.1))
## Simpangan baku terpangkas : 966.9027

Winsorized Variance

library(datawizard)
# Winsorize the valuations with the percentile method and a 0.1 threshold.
win.data2 <- winsorize(dataval, threshold = 0.1, method = "percentile", verbose = FALSE)

# sd() returns a standard deviation, so the labels now say "Standard
# Deviation" instead of "Variance". Square the values if a variance is
# required.
cat("Standard Deviation biasa :", sd(dataval))
## Standard Deviation biasa : 978.2266
cat("\nWinsorized Standard Deviation :", sd(win.data2))
## 
## Winsorized Standard Deviation : 837.8211