# Load the startup dataset and preview its last rows.
# NOTE(review): absolute, machine-specific path - confirm it exists where this runs.
data_up <- read.csv(file = "C:/Users/mhdha/Downloads/startup_data.csv")
tail(x = data_up)
## Startup.Name Industry Funding.Rounds Funding.Amount..M.USD.
## 495 Startup_495 E-Commerce 4 114.12
## 496 Startup_496 EdTech 2 181.86
## 497 Startup_497 AI 2 107.34
## 498 Startup_498 E-Commerce 1 160.29
## 499 Startup_499 Gaming 5 234.65
## 500 Startup_500 HealthTech 4 211.76
## Valuation..M.USD. Revenue..M.USD. Employees Market.Share.... Profitable
## 495 1503.71 79.19 4014 1.13 1
## 496 2378.65 59.64 3331 0.58 1
## 497 1394.58 10.22 2223 5.85 0
## 498 502.09 84.73 2222 4.32 0
## 499 2814.52 53.16 4972 5.53 0
## 500 2563.17 84.19 2374 5.26 0
## Year.Founded Region Exit.Status
## 495 2015 Europe IPO
## 496 1993 Europe Private
## 497 2019 South America Private
## 498 2019 Australia Private
## 499 2011 Europe Private
## 500 2000 North America Private
# Per-column summary of the loaded data frame.
summary(object = data_up)
## Startup.Name Industry Funding.Rounds Funding.Amount..M.USD.
## Length:500 Length:500 Min. :1.000 Min. : 0.57
## Class :character Class :character 1st Qu.:2.000 1st Qu.: 79.21
## Mode :character Mode :character Median :3.000 Median :156.00
## Mean :2.958 Mean :152.66
## 3rd Qu.:4.000 3rd Qu.:226.45
## Max. :5.000 Max. :299.81
## Valuation..M.USD. Revenue..M.USD. Employees Market.Share....
## Min. : 2.43 Min. : 0.12 Min. : 12 Min. : 0.100
## 1st Qu.: 557.03 1st Qu.:22.80 1st Qu.:1383 1st Qu.: 2.760
## Median :1222.58 Median :48.80 Median :2496 Median : 5.135
## Mean :1371.81 Mean :49.32 Mean :2532 Mean : 5.093
## 3rd Qu.:2052.09 3rd Qu.:74.97 3rd Qu.:3709 3rd Qu.: 7.553
## Max. :4357.49 Max. :99.71 Max. :4984 Max. :10.000
## Profitable Year.Founded Region Exit.Status
## Min. :0.000 Min. :1990 Length:500 Length:500
## 1st Qu.:0.000 1st Qu.:1998 Class :character Class :character
## Median :0.000 Median :2006 Mode :character Mode :character
## Mean :0.432 Mean :2006
## 3rd Qu.:1.000 3rd Qu.:2014
## Max. :1.000 Max. :2022
# List the column names of the data frame.
names(data_up)
## [1] "Startup.Name" "Industry" "Funding.Rounds"
## [4] "Funding.Amount..M.USD." "Valuation..M.USD." "Revenue..M.USD."
## [7] "Employees" "Market.Share...." "Profitable"
## [10] "Year.Founded" "Region" "Exit.Status"
# Number of observations (n), taken from the data instead of a hard-coded 500
# so it stays correct if the input file changes.
n <- nrow(data_up)
# Number of histogram classes k from the square-root rule: k = sqrt(n)
k1 <- sqrt(n)
k1
## [1] 22.36068
# Histograms of the numeric variables.
# The class count is k1 (square-root rule) rounded up, instead of a literal
# repeated in every call, so it follows the data size automatically.
n_breaks <- ceiling(k1)
hist(data_up$Funding.Amount..M.USD., breaks = n_breaks, col = "red",
     main = "Sebaran Data Total Pendanaan (Funding Amount)")
hist(data_up$Valuation..M.USD., breaks = n_breaks, col = "coral",
     main = "Sebaran Data Penilaian")
hist(data_up$Revenue..M.USD., breaks = n_breaks, col = "orange",
     main = "Sebaran Data Pendapatan")
hist(data_up$Employees, breaks = n_breaks, col = "yellow",
     main = "Sebaran Data Jumlah Pekerja")
hist(data_up$Market.Share...., breaks = n_breaks, col = "green",
     main = "Sebaran Data Market Share")
Dari histogram yang ditunjukkan, hampir semua peubah cenderung menyebar secara seragam, kecuali Penilaian (Valuation), yang histogramnya menjulur ke kanan.
# Boxplots of the numeric variables, one colour per variable.
# `breaks` is a hist() argument, not a boxplot() one, so it is not passed here.
boxplot(data_up$Funding.Amount..M.USD., col = "red")
boxplot(data_up$Valuation..M.USD., col = "coral")
boxplot(data_up$Revenue..M.USD., col = "orange")
boxplot(data_up$Employees, col = "yellow")
boxplot(data_up$Market.Share...., col = "green")
Dari boxplot di atas, hampir semua peubah tidak memiliki outlier, kecuali data Penilaian (Valuation) yang memiliki outlier.
library(nortest)
# Anderson-Darling normality test on the Valuation column.
ad.test(x = data_up$Valuation..M.USD.)
##
## Anderson-Darling normality test
##
## data: data_up$Valuation..M.USD.
## A = 6.0269, p-value = 7.771e-15
Mengecek bentuk sebaran
# QQ plot of Valuation (y) against quantiles of a normal distribution that
# uses the sample mean and standard deviation of Valuation (x).
qqplot(x = qnorm(ppoints(length(data_up$Valuation..M.USD.)),
mean = mean(data_up$Valuation..M.USD.),
sd = sd(data_up$Valuation..M.USD.)),
y = data_up$Valuation..M.USD.,
main = "QQ-Plot Valuation")
# Red reference line, computed with the same fitted-normal quantile function
# so that it is on the same scale as the x axis above.
qqline(data_up$Valuation..M.USD., col = "red",
distribution = function(p) qnorm(p,
mean = mean(data_up$Valuation..M.USD.),
sd = sd(data_up$Valuation..M.USD.)))
Chi-Squared Test
# Chi-square goodness-of-fit test of a numeric vector against a normal
# distribution with the vector's own mean and standard deviation.
#
# Args:
#   Valuation..M.USD.: numeric vector to test. The function now uses this
#     argument instead of reading the global `data_up` directly.
# Returns:
#   The result of chisq.test() on the observed class counts.
chi_square_test <- function(Valuation..M.USD.) {
  x <- Valuation..M.USD.
  # Bin once; the class boundaries and the observed counts come from the same
  # histogram object.
  h <- hist(x, breaks = 10, plot = FALSE)
  breaks <- h$breaks
  # Observed frequency per class
  observed <- h$counts
  # Expected frequency per class under the fitted normal distribution
  expected <- diff(pnorm(breaks, mean = mean(x), sd = sd(x))) * length(x)
  # Expected frequencies are passed as `p`; rescale.p = TRUE is kept so that
  # chisq.test() accepts values that do not sum to 1.
  chisq.test(x = observed, p = expected, rescale.p = TRUE)
}
# Pass the column explicitly: `dataval` is not defined yet at this point.
chi_square_test(data_up$Valuation..M.USD.)
## Warning in chisq.test(x = observed, p = expected, rescale.p = TRUE):
## Chi-squared approximation may be incorrect
##
## Chi-squared test for given probabilities
##
## data: observed
## X-squared = 99.643, df = 8, p-value < 2.2e-16
# Return the values of a numeric vector that lie outside the 1.5 * IQR fences.
#
# Args:
#   Valuation..M.USD.: numeric vector to screen. The function now uses this
#     argument instead of reading the global `data_up` directly.
# Returns:
#   Numeric vector of the values below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR
#   (length 0 when there are none). NA values are ignored.
detect_outlier <- function(Valuation..M.USD.) {
  x <- Valuation..M.USD.
  stopifnot(is.numeric(x))
  kuartil <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  IQR_value <- kuartil[2] - kuartil[1]
  batas_bawah <- kuartil[1] - 1.5 * IQR_value
  batas_atas <- kuartil[2] + 1.5 * IQR_value
  # which() drops NA comparisons so missing values are never reported
  x[which(x < batas_bawah | x > batas_atas)]
}
# Print the Valuation values flagged as outliers.
data_up$Valuation..M.USD. |>
  detect_outlier() |>
  print()
## [1] 4357.49
Hanya data Penilaian (Valuation) yang memiliki outlier; data lainnya tidak memiliki outlier.
# Return the positions of the values that lie outside the 1.5 * IQR fences.
#
# Args:
#   Valuation..M.USD.: numeric vector to screen. The function now uses this
#     argument instead of reading the global `data_up` directly.
# Returns:
#   Integer vector of indices into the input whose values are below
#   Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR (length 0 when there are none).
#   NA values are ignored.
detect_outlier_index <- function(Valuation..M.USD.) {
  x <- Valuation..M.USD.
  stopifnot(is.numeric(x))
  kuartil <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  IQR_value <- kuartil[2] - kuartil[1]
  batas_bawah <- kuartil[1] - 1.5 * IQR_value
  batas_atas <- kuartil[2] + 1.5 * IQR_value
  which(x < batas_bawah | x > batas_atas)
}
# Print the row positions of the Valuation outliers.
data_up$Valuation..M.USD. |>
  detect_outlier_index() |>
  print()
## [1] 385
library(EnvStats)
## Warning: package 'EnvStats' was built under R version 4.4.3
##
## Attaching package: 'EnvStats'
## The following objects are masked from 'package:stats':
##
## predict, predict.lm
# Rosner test on Valuation with up to k = 10 candidate outliers;
# show the per-candidate statistics table.
x <- data_up$Valuation..M.USD.
rosnerTest(x, k = 10)[["all.stats"]]
## i Mean.i SD.i Value Obs.Num R.i+1 lambda.i+1 Outlier
## 1 0 1371.809 978.2266 4357.49 385 3.052136 3.863127 FALSE
## 2 1 1365.826 970.0066 4264.90 226 2.988716 3.862597 FALSE
## 3 2 1360.004 962.2168 4137.15 365 2.886195 3.862066 FALSE
## 4 3 1354.417 955.0639 4125.56 109 2.901527 3.861533 FALSE
## 5 4 1348.830 947.8634 4110.36 398 2.913427 3.861000 FALSE
## 6 5 1343.251 940.6355 4110.09 52 2.941457 3.860465 FALSE
## 7 6 1337.650 933.2900 4066.70 399 2.924118 3.859929 FALSE
## 8 7 1332.114 926.0844 4016.20 95 2.898317 3.859392 FALSE
## 9 8 1326.659 919.0628 3774.33 253 2.663225 3.858854 FALSE
## 10 9 1321.674 913.3174 3721.99 361 2.628129 3.858314 FALSE
# Median of a numeric vector, ignoring missing values.
#
# Args:
#   Valuation..M.USD.: numeric vector. The function now uses this argument
#     instead of reading the global `data_up` directly.
# Returns:
#   The median as a single number.
hitung_median <- function(Valuation..M.USD.) {
  # na.rm = TRUE so NA values are skipped
  median(Valuation..M.USD., na.rm = TRUE)
}
# Median of the Valuation column.
data_up$Valuation..M.USD. |> hitung_median()
## [1] 1222.58
# Print the ordinary mean and the trimmed mean of a numeric vector.
#
# Args:
#   Valuation..M.USD.: numeric vector. The function now uses this argument
#     instead of reading the global `data_up` directly.
#   trim: value passed to mean() as `trim` for the trimmed mean (default 0.1).
# Output:
#   Two labelled lines written with cat(); missing values are ignored.
hitung_mean <- function(Valuation..M.USD., trim = 0.1) {
  rataan.biasa <- mean(Valuation..M.USD., na.rm = TRUE)
  rataan.terpangkas <- mean(Valuation..M.USD., trim = trim, na.rm = TRUE)
  cat("Rataan Biasa : ", rataan.biasa, "\n")
  cat("Rataan Terpangkas : ", rataan.terpangkas, "\n\n")
}
# Ordinary and trimmed mean of the Valuation column.
data_up$Valuation..M.USD. |> hitung_mean()
## Rataan Biasa : 1371.809
## Rataan Terpangkas : 1286.024
library(datawizard)
## Warning: package 'datawizard' was built under R version 4.4.3
##
## Attaching package: 'datawizard'
## The following objects are masked from 'package:EnvStats':
##
## kurtosis, skewness
# Print the mean of a vector after winsorizing it.
#
# Args:
#   data_vector: vector to winsorize.
#   threshold: passed to winsorize() as `threshold` (default 0.1).
winsor_data <- function(data_vector, threshold = 0.1) {
  rataan_winsor <- mean(
    winsorize(
      data_vector,
      threshold = threshold,
      method = "percentile",
      verbose = FALSE
    ),
    na.rm = TRUE
  )
  cat("Winsorized Mean : ", rataan_winsor, "\n\n")
}
# Winsorized mean of the Valuation column.
data_up$Valuation..M.USD. |> winsor_data()
## Winsorized Mean : 1317.304
dataval <- data_up[["Valuation..M.USD."]]
# Midhinge of Valuation: the average of the first and third quartiles
unname(quantile(dataval, 0.25) + quantile(dataval, 0.75)) / 2
## [1] 1304.556
# Interquartile range
IQR(x = dataval)
## [1] 1495.057
# Median absolute deviation
mad(x = dataval)
## [1] 1080.252
library(chemometrics)
## Warning: package 'chemometrics' was built under R version 4.4.3
## Loading required package: rpart
# sd() and sd_trim() return standard deviations, so the output is labelled
# "Simpangan baku" (standard deviation) rather than "Ragam" (variance).
cat("Simpangan baku tak terpangkas :", sd(dataval), "\n")
## Simpangan baku tak terpangkas : 978.2266
cat("Simpangan baku terpangkas :", sd_trim(dataval, trim = 0.1))
## Simpangan baku terpangkas : 966.9027
library(datawizard)
# Winsorize Valuation, then compare the spread before and after.
# Both values come from sd(), so they are labelled as standard deviations
# ("Simpangan baku") rather than variances.
win.data2 <- winsorize(dataval, threshold = 0.1, method = "percentile", verbose = FALSE)
cat("Simpangan baku biasa :", sd(dataval))
## Simpangan baku biasa : 978.2266
cat("\nSimpangan baku winsorized :", sd(win.data2))
##
## Simpangan baku winsorized : 837.8211