#install.packages('openintro')
#source('create_datasets.R')
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
# Read the three course datasets straight from their hosted CSV files.
cars <- read.csv("https://assets.datacamp.com/production/course_1796/datasets/cars04.csv")
comics <- read.csv("https://assets.datacamp.com/production/course_1796/datasets/comics.csv")
life <- read.csv("https://assets.datacamp.com/production/course_1796/datasets/life_exp_raw.csv")
# EXPLANATION - Prefer reading from the URL so the datasets do not have to be
# downloaded into a local directory on the laptop or device first.
head(comics)
## name id align eye hair
## 1 Spider-Man (Peter Parker) Secret Good Hazel Eyes Brown Hair
## 2 Captain America (Steven Rogers) Public Good Blue Eyes White Hair
## 3 Wolverine (James \\"Logan\\" Howlett) Public Neutral Blue Eyes Black Hair
## 4 Iron Man (Anthony \\"Tony\\" Stark) Public Good Blue Eyes Black Hair
## 5 Thor (Thor Odinson) No Dual Good Blue Eyes Blond Hair
## 6 Benjamin Grimm (Earth-616) Public Good Blue Eyes No Hair
## gender gsm alive appearances first_appear publisher
## 1 Male <NA> Living Characters 4043 Aug-62 marvel
## 2 Male <NA> Living Characters 3360 Mar-41 marvel
## 3 Male <NA> Living Characters 3061 Oct-74 marvel
## 4 Male <NA> Living Characters 2961 Mar-63 marvel
## 5 Male <NA> Living Characters 2258 Nov-50 marvel
## 6 Male <NA> Living Characters 2255 Nov-61 marvel
#EXPLANATION - Menampilkan 6 data teratas dari dalam dataset - Membuktikan bahwa dataset sudah terbaca - menunjukan jumlah baris dan kolom yang ada dalam dataset - menunjukan jenis file dalam dataset
comics_align <- as.factor(comics$align)
levels(comics_align)
## [1] "Bad" "Good" "Neutral"
## [4] "Reformed Criminals"
#EXPLANATION - Menunjukan attribute dalam dataset comics align
comics_gender <- as.factor(comics$gender)
levels(comics_gender)
## [1] "Female" "Male" "Other"
#EXPLANATION - Menunjukan attribute dalam dataset comics gender
table(comics$align, comics$gender)
##
## Female Male Other
## Bad 1573 7561 32
## Good 2490 4809 17
## Neutral 836 1799 17
## Reformed Criminals 1 2 0
#EXPLANATION - menampilkan gender dan align dalam bentuk tabel. - bisa dilihat dari tabel di atas bahwa penjahat pria (7561) jauh lebih banyak daripada penjahat wanita (1573). - bisa dilihat juga bahwa lebih banyak wanita yang baik (2490) daripada yang jahat (1573), berkebalikan dengan pria. - dapat disimpulkan dari tabel di atas bahwa hanya sedikit tokoh yang gendernya other (66 tokoh).
# Load dplyr
# Print tab
tab <- table(comics$align, comics$gender)
tab
##
## Female Male Other
## Bad 1573 7561 32
## Good 2490 4809 17
## Neutral 836 1799 17
## Reformed Criminals 1 2 0
#EXPLANATION - Memasukan table kedalam satu variable
# Drop the "Reformed Criminals" alignment, then discard any factor levels
# that are no longer used. filter() keeps only rows where the condition is
# TRUE, so rows whose align is NA are dropped as well.
comics <- droplevels(filter(comics, align != "Reformed Criminals"))
# Confirm which alignment levels remain.
comics_align <- as.factor(comics[["align"]])
levels(comics_align)
## [1] "Bad" "Good" "Neutral"
# EXPLANATION - The "Reformed Criminals" rows have been filtered out.
# Load ggplot2
# Create side-by-side barchart of gender by alignment
ggplot(comics, aes(x = align, fill = gender)) +
geom_bar(position = "dodge")
#EXPLANATION - lebih banyak pria dibandingkan dengan wanita - lebih
# banyak orang jahat dibandingkan orang baik - dan lebih banyak tokoh NA
# atau tidak dikenali dibandingkan other
# Create side-by-side barchart of alignment by gender
# position = "dodge" places the alignment bars next to each other within each
# gender instead of stacking them (the argument was previously misspelled as
# `positio`, so it was not applied). The x-axis labels are rotated 90 degrees.
ggplot(comics, aes(x = gender, fill = align)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 90))
# Bar chart interpretation: Among characters with "Neutral" alignment,
# males are the most common. In general, there is an association between
# gender and alignment. There are more male characters than female
# characters in this dataset.
#EXPLANATION - kebalikan dengan code sebelumnya: sebelumnya kita menghitung
# jumlah gender berdasarkan align, sekarang kita menghitung align berdasarkan gender
# simplify display format
options(scipen = 999, digits = 3)
## create table of counts
tbl_cnt <- table(comics$id, comics$align)
tbl_cnt
##
## Bad Good Neutral
## No Dual 474 647 390
## Public 2172 2930 965
## Secret 4493 2475 959
## Unknown 7 0 2
#EXPLANATION - Membuat tabel berdasarkan jumlah count
# Proportional table
# All values add up to 1
prop.table(tbl_cnt)
##
## Bad Good Neutral
## No Dual 0.030553 0.041704 0.025139
## Public 0.140003 0.188862 0.062202
## Secret 0.289609 0.159533 0.061815
## Unknown 0.000451 0.000000 0.000129
#EXPLANATION - Menggunakan prop.table untuk mengubah tabel count menjadi proporsional - The prop.table() function shows each individual value as a proportion of the whole.
sum(prop.table(tbl_cnt))
## [1] 1
#EXPLANATION - menjumlahkan semua yang ada di dalam tabel sehingga memunculkan angka 1
prop.table(tbl_cnt, 1)
##
## Bad Good Neutral
## No Dual 0.314 0.428 0.258
## Public 0.358 0.483 0.159
## Secret 0.567 0.312 0.121
## Unknown 0.778 0.000 0.222
#EXPLANATION - proporsi dihitung per baris (id) - No Dual memiliki 43% di good - Public memiliki 48% di good - Secret memiliki 57% di bad - Unknown memiliki 78% di bad
prop.table(tbl_cnt, 2)
##
## Bad Good Neutral
## No Dual 0.066331 0.106907 0.168394
## Public 0.303946 0.484137 0.416667
## Secret 0.628743 0.408956 0.414076
## Unknown 0.000980 0.000000 0.000864
# Look at the proportion of bad characters in the secret and unknown groups. Note there are very few characters with id = unknown.
#EXPLANATION - proporsi dihitung per kolom (align) - 16.8% dari karakter Neutral adalah No Dual - 48.4% dari karakter Good adalah Public - 62.9% dari karakter Bad adalah Secret - hanya sekitar 0.1% dari karakter Bad adalah Unknown
ggplot(comics, aes(x = id, fill = align)) +
geom_bar(position = "fill") +
ylab("proportion")
#EXPLANATION - Diperlihatkan dari grafik diatas bahwa align good dan
align Neutral terbanyak ada di No Dual - good paling sedikit ada di
unknown karena tidak terlihat sama sekali pada grafik diatas maka bisa
di simpulkan bahawa good unknown lebih sedikit di bandingkan neutral nya
berkebalikan dengan yang lainnya - neutral paling sedikit berada di
NA
ggplot(comics, aes(x = align, fill = id)) +
geom_bar(position = "fill") +
ylab("proportion")
#EXPLANATION - kebalikan dari grafik sebelumnya dimana sekarang kita
melihat align berdasarkan id - bad terbanyak ada pada id secret - good
terbanyak ada pada public - Neutral paling sedikit dimiliki oleh NA
tab <- table(comics$align, comics$gender)
options(scipen = 999, digits = 3) # Print fewer digits
prop.table(tab) # Joint proportions
##
## Female Male Other
## Bad 0.082210 0.395160 0.001672
## Good 0.130135 0.251333 0.000888
## Neutral 0.043692 0.094021 0.000888
#EXPLANATION - mengubah tabel menjadi proporsi gabungan (joint) dari seluruh karakter - bad memiliki 39.5% di male - good memiliki 25.1% di male - Neutral memiliki 9.4% di male (hanya 4.4% di female)
prop.table(tab, 2)
##
## Female Male Other
## Bad 0.321 0.534 0.485
## Good 0.508 0.339 0.258
## Neutral 0.171 0.127 0.258
# Approximately what proportion of all female characters are good? - 51%, dibulatkan dari 50.8%
#EXPLANATION - proporsi dihitung per kolom (gender) - 53% dari male adalah bad - 51% dari female adalah good - 26% dari other adalah Neutral
# Plot of gender by align
ggplot(comics, aes(x = align, fill = gender)) +
geom_bar()
#EXPLANATION - pada align bad bisa kita lihat secara keseluruhan baik
female, male, other, dan NA adalah yang terbanyak dibandingkan align
lainnya - NA paling sedikit berada pada Neutral - Neutral memiliki
gender yang sedikit dibandingkan lainnya
# Plot proportion of gender, conditional on align
ggplot(comics, aes(x = align, fill = gender)) +
geom_bar(position = "fill")
#EXPLANATION - berbeda dengan grafik sebelumnya grafik ini merupakan
hasil dari proporsi - bad memiliki male terbanyak - good memiliki male
paling sedikit - secara bad ataupun neutral gender NA dan other sama
saja
# Can use table function on just one variable
# This is called a marginal distribution
table(comics$id)
##
## No Dual Public Secret Unknown
## 1511 6067 7927 9
#EXPLANATION
# Simple barchart
ggplot(comics, aes(x = id)) +
geom_bar()
# - You can also facet to see variables individually - A little easier than
# filtering each and plotting. - This is a rearrangement of the bar chart
# we plotted earlier - We facet by alignment rather than coloring the
# stack. - This can make it a little easier to answer some questions.
#EXPLANATION - secret merupakan id terbanyak berdasarkan count (7927) - count paling sedikit dimiliki oleh unknown (9) - selisih count antara secret dan public (6067) sekitar 1900
ggplot(comics, aes(x = id)) +
geom_bar() +
facet_wrap(~align)
# Marginal barchart - It makes more sense to put Neutral between Bad and
# Good - We need to reorder the levels so it will chart this way -
# Otherwise it will default to alphabetical order
#EXPLANATION
# Put "Neutral" between "Bad" and "Good" so the bars are drawn in that order
# rather than alphabetically.
comics[["align"]] <- factor(
  comics[["align"]],
  levels = c("Bad", "Neutral", "Good")
)
# Bar chart of the alignment counts.
comics %>%
  ggplot(aes(x = align)) +
  geom_bar()
#EXPLANATION
# Plot of alignment broken down by gender
ggplot(comics, aes(x = align)) +
geom_bar() +
facet_wrap(~ gender)
#EXPLANATION
# Put levels of flavor in descending order
# Build the example data: one row per pie, each flavor repeated as many times
# as it occurs.
pies <- data.frame(
  flavors = as.factor(rep(
    c("apple", "blueberry", "boston creme", "cherry", "key lime", "pumpkin", "strawberry"),
    times = c(17, 14, 15, 13, 16, 12, 11)
  ))
)
# Level order from the most frequent flavor to the least frequent one.
lev <- c("apple", "key lime", "boston creme", "blueberry", "cherry", "pumpkin", "strawberry")
# Read the source column by its full name (`flavors`); `pies$flavor` on the
# right-hand side only resolved through `$` partial matching.
pies$flavor <- factor(pies$flavors, levels = lev)
head(pies$flavor)
## [1] apple apple apple apple apple apple
## Levels: apple key lime boston creme blueberry cherry pumpkin strawberry
#EXPLANATION
# Create barchart of flavor
ggplot(pies, aes(x = flavor)) +
geom_bar(fill = "chartreuse") +
theme(axis.text.x = element_text(angle = 90))
#EXPLANATION
# A dot plot shows all the datapoints
ggplot(cars, aes(x = weight)) +
geom_dotplot(dotsize = 0.4)
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bindot).
#EXPLANATION
# A histogram groups the points into bins so it does not get overwhelming
# Each bin spans 500 units of weight. `dotsize` is a geom_dotplot() setting
# that geom_histogram() ignores with a warning, so it is not passed here.
ggplot(cars, aes(x = weight)) +
  geom_histogram(binwidth = 500)
## Warning: Removed 2 rows containing non-finite values (stat_bin).
#EXPLANATION
# A density plot gives a bigger picture representation of the distribution
# It more helpful when there is a lot of data
ggplot(cars, aes(x = weight)) +
geom_density()
## Warning: Removed 2 rows containing non-finite values (stat_density).
#EXPLANATION
# A boxplot is a good way to just show the summary info of the distribution
ggplot(cars, aes(x = 1, y = weight)) +
geom_boxplot() +
coord_flip()
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
#EXPLANATION
# Load package
library(ggplot2)
# Learn data structure
str(cars)
## 'data.frame': 428 obs. of 19 variables:
## $ name : chr "Chevrolet Aveo 4dr" "Chevrolet Aveo LS 4dr hatch" "Chevrolet Cavalier 2dr" "Chevrolet Cavalier 4dr" ...
## $ sports_car : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ suv : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ wagon : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ minivan : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ pickup : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ all_wheel : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ rear_wheel : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ msrp : int 11690 12585 14610 14810 16385 13670 15040 13270 13730 15460 ...
## $ dealer_cost: int 10965 11802 13697 13884 15357 12849 14086 12482 12906 14496 ...
## $ eng_size : num 1.6 1.6 2.2 2.2 2.2 2 2 2 2 2 ...
## $ ncyl : int 4 4 4 4 4 4 4 4 4 4 ...
## $ horsepwr : int 103 103 140 140 140 132 132 130 110 130 ...
## $ city_mpg : int 28 28 26 26 26 29 29 26 27 26 ...
## $ hwy_mpg : int 34 34 37 37 37 36 36 33 36 33 ...
## $ weight : int 2370 2348 2617 2676 2617 2581 2626 2612 2606 2606 ...
## $ wheel_base : int 98 98 104 104 104 105 105 103 103 103 ...
## $ length : int 167 153 183 183 183 174 174 168 168 168 ...
## $ width : int 66 66 69 68 69 67 67 67 67 67 ...
#EXPLANATION
# Create faceted histogram
ggplot(cars, aes(x = city_mpg)) +
geom_histogram() +
facet_wrap(~ suv)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 14 rows containing non-finite values (stat_bin).
#EXPLANATION
unique(cars$ncyl)
## [1] 4 6 3 8 5 12 10 -1
#EXPLANATION
table(cars$ncyl)
##
## -1 3 4 5 6 8 10 12
## 2 1 136 7 190 87 2 3
#EXPLANATION
# Keep only the cars with 4, 6 or 8 cylinders.
common_cyl <- cars %>%
  filter(ncyl %in% c(4, 6, 8))
# Box plots of city mileage, one box per cylinder count; ncyl is converted to
# a factor inside aes() so each count gets its own box.
common_cyl %>%
  ggplot(aes(x = as.factor(ncyl), y = city_mpg)) +
  geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
#EXPLANATION
# Create overlaid density plots for same data
ggplot(common_cyl, aes(x = city_mpg, fill = as.factor(ncyl))) +
geom_density(alpha = .3)
## Warning: Removed 11 rows containing non-finite values (stat_density).
– Compare distribution via plots The highest mileage cars have 4
cylinders. The typical 4 cylinder car gets better mileage than the
typical 6 cylinder car, which gets better mileage than the typical 8
cylinder car. Most of the 4 cylinder cars get better mileage than even
the most efficient 8 cylinder cars. #EXPLANATION
# Create hist of horsepwr
cars %>%
ggplot(aes(horsepwr)) +
geom_histogram() +
ggtitle("Horsepower distribution")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#EXPLANATION
# Create hist of horsepwr for affordable cars
cars %>%
filter(msrp < 25000) %>%
ggplot(aes(horsepwr)) +
geom_histogram() +
xlim(c(90, 550)) +
ggtitle("Horsepower distribtion for msrp < 25000")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
#EXPLANATION
# Create hist of horsepwr with binwidth of 3
cars %>%
ggplot(aes(horsepwr)) +
geom_histogram(binwidth = 3) +
ggtitle("binwidth = 3")
# Create hist of horsepwr with binwidth of 30
cars %>%
ggplot(aes(horsepwr)) +
geom_histogram(binwidth = 30) +
ggtitle("binwidth = 30")
#EXPLANATION
# Create hist of horsepwr with binwidth of 60
cars %>%
ggplot(aes(horsepwr)) +
geom_histogram(binwidth = 60) +
ggtitle("binwidth = 60")
#EXPLANATION
# Construct box plot of msrp
cars %>%
ggplot(aes(x = 1, y = msrp)) +
geom_boxplot()
#EXPLANATION
# Exclude outliers from data
cars_no_out <- cars %>%
filter(msrp < 100000)
# Construct box plot of msrp using the reduced dataset
cars_no_out %>%
ggplot(aes(x = 1, y = msrp)) +
geom_boxplot()
# Create plot of city_mpg
cars %>%
ggplot(aes(x = 1, y = city_mpg)) +
geom_boxplot()
## Warning: Removed 14 rows containing non-finite values (stat_boxplot).
#EXPLANATION
cars %>%
ggplot(aes(city_mpg)) +
geom_density()
## Warning: Removed 14 rows containing non-finite values (stat_density).
#EXPLANATION
# Create plot of width
cars %>%
ggplot(aes(x = 1, y = width)) +
geom_boxplot()
## Warning: Removed 28 rows containing non-finite values (stat_boxplot).
#EXPLANATION
cars %>%
ggplot(aes(x = width)) +
geom_density()
## Warning: Removed 28 rows containing non-finite values (stat_density).
# Facet hists using hwy mileage and ncyl
common_cyl %>%
ggplot(aes(x = hwy_mpg)) +
geom_histogram() +
facet_grid(ncyl ~ suv) +
ggtitle("hwy_mpg by ncyl and suv")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 11 rows containing non-finite values (stat_bin).
– Interpret 3 var plot Across both SUVs and non-SUVs, mileage tends to
decrease as the number of cylinders increases. #EXPLANATION
head(life)
## State County fips Year Female.life.expectancy..years.
## 1 Alabama Autauga County 1001 1985 77.0
## 2 Alabama Baldwin County 1003 1985 78.8
## 3 Alabama Barbour County 1005 1985 76.0
## 4 Alabama Bibb County 1007 1985 76.6
## 5 Alabama Blount County 1009 1985 78.9
## 6 Alabama Bullock County 1011 1985 75.1
## Female.life.expectancy..national..years.
## 1 77.8
## 2 77.8
## 3 77.8
## 4 77.8
## 5 77.8
## 6 77.8
## Female.life.expectancy..state..years. Male.life.expectancy..years.
## 1 76.9 68.1
## 2 76.9 71.1
## 3 76.9 66.8
## 4 76.9 67.3
## 5 76.9 70.6
## 6 76.9 66.6
## Male.life.expectancy..national..years. Male.life.expectancy..state..years.
## 1 70.8 69.1
## 2 70.8 69.1
## 3 70.8 69.1
## 4 70.8 69.1
## 5 70.8 69.1
## 6 70.8 69.1
#EXPLANATION
x <- head(round(life$Female.life.expectancy..years.), 11)
x
## [1] 77 79 76 77 79 75 77 77 77 78 77
mean
-balance point of the data -sensitive to extreme values #EXPLANATION
# Mean by hand: the total divided by the number of observations. The count is
# taken from length(x) instead of a hard-coded 11 so it stays correct if x
# changes.
sum(x) / length(x)
## [1] 77.2
# Built-in equivalent.
mean(x)
## [1] 77.2
median
sort(x)
## [1] 75 76 77 77 77 77 77 77 78 79 79
#EXPLANATION
median(x)
## [1] 77
mode
table(x)
## x
## 75 76 77 78 79
## 1 1 6 1 2
#install.packages('gapminder')
library(gapminder)
str(gapminder)
## tibble [1,704 x 6] (S3: tbl_df/tbl/data.frame)
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num [1:1704] 28.8 30.3 32 34 36.1 ...
## $ pop : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num [1:1704] 779 821 853 836 740 ...
#EXPLANATION
# Keep only the observations from the year 2007.
gap2007 <- gapminder %>%
  filter(year == 2007)
# Mean and median life expectancy within each continent.
summarise(
  group_by(gap2007, continent),
  mean(lifeExp),
  median(lifeExp)
)
## # A tibble: 5 x 3
## continent `mean(lifeExp)` `median(lifeExp)`
## <fct> <dbl> <dbl>
## 1 Africa 54.8 52.9
## 2 Americas 73.6 72.9
## 3 Asia 70.7 72.4
## 4 Europe 77.6 78.6
## 5 Oceania 80.7 80.7
#EXPLANATION
# Generate box plots of lifeExp for each continent
gap2007 %>%
ggplot(aes(x = continent, y = lifeExp)) +
geom_boxplot()
# Measures of variability - We want to know 'How much is the data spread
# out from the middle?' - Just looking at the data gives us a sense of this
# - But we want to break it down to one number so we can compare sample
# distributions #EXPLANATION
x
## [1] 77 79 76 77 79 75 77 77 77 78 77
# We could just take the difference between each point and the mean and add them up - But that would equal 0 (up to floating-point error). That's the idea of the mean.
# Look at the difference between each point and the mean
sum(x - mean(x))
## [1] -0.0000000000000568
# So we can square the differences - But this number will keep getting bigger as you add more observations - We want something that is stable #EXPLANATION
# Square each difference to get rid of negatives then sum
sum((x - mean(x))^2)
## [1] 13.6
Variance
sum((x - mean(x))^2)/(length(x) - 1)
## [1] 1.36
#EXPLANATION
var(x)
## [1] 1.36
Standard Deviation
sqrt(sum((x - mean(x))^2)/(length(x) - 1))
## [1] 1.17
#EXPLANATION
sd(x)
## [1] 1.17
Inter Quartile Range
#EXPLANATION
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 75.0 77.0 77.0 77.2 77.5 79.0
#EXPLANATION
IQR(x)
## [1] 0.5
Range
#EXPLANATION
max(x)
## [1] 79
#EXPLANATION
min(x)
## [1] 75
#EXPLANATION
diff(range(x))
## [1] 4
#EXPLANATION
str(gap2007)
## tibble [142 x 6] (S3: tbl_df/tbl/data.frame)
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 4 1 1 2 5 4 3 3 4 ...
## $ year : int [1:142] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
## $ lifeExp : num [1:142] 43.8 76.4 72.3 42.7 75.3 ...
## $ pop : int [1:142] 31889923 3600523 33333216 12420476 40301927 20434176 8199783 708573 150448339 10392226 ...
## $ gdpPercap: num [1:142] 975 5937 6223 4797 12779 ...
# Compute groupwise measures of spread
gap2007 %>%
group_by(continent) %>%
summarize(sd(lifeExp),
IQR(lifeExp),
n())
## # A tibble: 5 x 4
## continent `sd(lifeExp)` `IQR(lifeExp)` `n()`
## <fct> <dbl> <dbl> <int>
## 1 Africa 9.63 11.6 52
## 2 Americas 4.44 4.63 25
## 3 Asia 7.96 10.2 33
## 4 Europe 2.98 4.78 30
## 5 Oceania 0.729 0.516 2
#EXPLANATION
# Generate overlaid density plots
gap2007 %>%
ggplot(aes(x = lifeExp, fill = continent)) +
geom_density(alpha = 0.3)
# Compute stats for lifeExp in Americas
head(gap2007)
## # A tibble: 6 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 2007 43.8 31889923 975.
## 2 Albania Europe 2007 76.4 3600523 5937.
## 3 Algeria Africa 2007 72.3 33333216 6223.
## 4 Angola Africa 2007 42.7 12420476 4797.
## 5 Argentina Americas 2007 75.3 40301927 12779.
## 6 Australia Oceania 2007 81.2 20434176 34435.
#EXPLANATION
gap2007 %>%
filter(continent == "Americas") %>%
summarize(mean(lifeExp),
sd(lifeExp))
## # A tibble: 1 x 2
## `mean(lifeExp)` `sd(lifeExp)`
## <dbl> <dbl>
## 1 73.6 4.44
#EXPLANATION
# Compute stats for population
gap2007 %>%
summarize(median(pop),
IQR(pop))
## # A tibble: 1 x 2
## `median(pop)` `IQR(pop)`
## <dbl> <dbl>
## 1 10517531 26702008.
# Shape and transformations
# 4 characteristics of a distribution that are of interest:
# - Describe the shape
# A: unimodal, left-skewed B: unimodal, symmetric C: unimodal, right-skewed D: bimodal, symmetric
#EXPLANATION
# Density of the untransformed population.
ggplot(gap2007, aes(x = pop)) +
  geom_density()
# Add the natural log of the skewed pop variable as a new column.
gap2007 <- mutate(gap2007, log_pop = log(pop))
# Density of the log-transformed population.
ggplot(gap2007, aes(x = log_pop)) +
  geom_density()
#EXPLANATION
# Filter for Asia, add column indicating outliers
str(gapminder)
## tibble [1,704 x 6] (S3: tbl_df/tbl/data.frame)
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num [1:1704] 28.8 30.3 32 34 36.1 ...
## $ pop : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num [1:1704] 779 821 853 836 740 ...
# Take the Asian rows of gap2007 and flag those whose life expectancy is
# below 50 as outliers.
gap_asia <- mutate(
  filter(gap2007, continent == "Asia"),
  is_outlier = lifeExp < 50
)
# Box plot of lifeExp using only the rows that are not flagged.
ggplot(filter(gap_asia, !is_outlier), aes(x = 1, y = lifeExp)) +
  geom_boxplot()
#EXPLANATION
# ggplot2, dplyr, and openintro are loaded
# Compute summary statistics
email %>%
group_by(spam) %>%
summarize(
median(num_char),
IQR(num_char))
## # A tibble: 2 x 3
## spam `median(num_char)` `IQR(num_char)`
## <fct> <dbl> <dbl>
## 1 0 6.83 13.6
## 2 1 1.05 2.82
#EXPLANATION
str(email)
## tibble [3,921 x 21] (S3: tbl_df/tbl/data.frame)
## $ spam : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ to_multiple : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 2 1 1 ...
## $ from : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ cc : int [1:3921] 0 0 0 0 0 0 0 1 0 0 ...
## $ sent_email : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 2 1 1 ...
## $ time : POSIXct[1:3921], format: "2012-01-01 13:16:41" "2012-01-01 14:03:59" ...
## $ image : num [1:3921] 0 0 0 0 0 0 0 1 0 0 ...
## $ attach : num [1:3921] 0 0 0 0 0 0 0 1 0 0 ...
## $ dollar : num [1:3921] 0 0 4 0 0 0 0 0 0 0 ...
## $ winner : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ inherit : num [1:3921] 0 0 1 0 0 0 0 0 0 0 ...
## $ viagra : num [1:3921] 0 0 0 0 0 0 0 0 0 0 ...
## $ password : num [1:3921] 0 0 0 0 2 2 0 0 0 0 ...
## $ num_char : num [1:3921] 11.37 10.5 7.77 13.26 1.23 ...
## $ line_breaks : int [1:3921] 202 202 192 255 29 25 193 237 69 68 ...
## $ format : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 2 2 1 2 ...
## $ re_subj : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ exclaim_subj: num [1:3921] 0 0 0 0 0 0 0 0 0 0 ...
## $ urgent_subj : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ exclaim_mess: num [1:3921] 0 1 6 48 1 1 1 18 1 0 ...
## $ number : Factor w/ 3 levels "none","small",..: 3 2 2 2 1 1 3 2 2 2 ...
#EXPLANATION
table(email$spam)
##
## 0 1
## 3554 367
# Recode spam from 0/1 into a factor labelled "not-spam" / "spam".
email <- mutate(
  email,
  spam = factor(ifelse(spam == 0, "not-spam", "spam"))
)
# Box plots of log(num_char) for each spam class.
ggplot(
  mutate(email, log_num_char = log(num_char)),
  aes(x = spam, y = log_num_char)
) +
  geom_boxplot()
– Spam and num_char interpretation - The median length of not-spam
emails is greater than that of spam emails
#EXPLANATION
# Compute center and spread for exclaim_mess by spam
email %>%
group_by(spam) %>%
summarize(
median(exclaim_mess),
IQR(exclaim_mess))
## # A tibble: 2 x 3
## spam `median(exclaim_mess)` `IQR(exclaim_mess)`
## <fct> <dbl> <dbl>
## 1 not-spam 1 5
## 2 spam 0 1
table(email$exclaim_mess)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 1435 733 507 128 190 113 115 51 93 45 85 17 56 20 43 11
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## 29 12 26 5 29 9 15 3 11 6 11 1 6 8 13 12
## 32 33 34 35 36 38 39 40 41 42 43 44 45 46 47 48
## 13 3 3 2 3 3 1 2 1 1 3 3 5 3 2 1
## 49 52 54 55 57 58 62 71 75 78 89 94 96 139 148 157
## 3 1 1 4 2 2 2 1 1 1 1 1 1 1 1 1
## 187 454 915 939 947 1197 1203 1209 1236
## 1 1 1 1 1 1 2 1 1
#EXPLANATION
# Create plot for spam and exclaim_mess
# exclaim_mess contains many zeros and log(0) is -Inf, which the histogram
# drops as non-finite. Adding 0.01 before taking the log keeps those emails
# in the plot; they land at log(0.01), about -4.6.
email %>%
  mutate(log_exclaim_mess = log(exclaim_mess + 0.01)) %>%
  ggplot(aes(x = log_exclaim_mess)) +
  geom_histogram() +
  facet_wrap(~ spam)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Spam and !!! interpretation - The most common value of exclaim_mess in
# both classes of email is zero (a log(exclaim_mess) of -4.6 after adding
# .01). - Even after a transformation, the distribution of exclaim_mess in
# both classes of email is right-skewed. - The typical number of
# exclamations in the not-spam group appears to be slightly higher than in
# the spam group.
# Check-in 1
# - Zero inflation in the exclaim_mess variable
# - you can analyze the two parts separately
# - or turn it into a categorical variable of is-zero, not-zero
# - Could make a barchart
# - need to decide if you are more interested in counts or proportions
#EXPLANATION
table(email$image)
##
## 0 1 2 3 4 5 9 20
## 3811 76 17 11 2 2 1 1
#EXPLANATION
# Proportion of spam among emails with and without at least one image;
# position = "fill" scales every bar to the same height.
ggplot(
  mutate(email, has_image = image > 0),
  aes(x = has_image, fill = spam)
) +
  geom_bar(position = "fill")
– Image and spam interpretation -An email without an image is more
likely to be not-spam than spam
# Test if images count as attachments
sum(email$image > email$attach)
## [1] 0
# There are no emails with more images than attachments, so images must be counted as attachments as well #EXPLANATION
## Within non-spam emails, is the typical length of emails shorter for
## those that were sent to multiple people?
email %>%
filter(spam == "not-spam") %>%
group_by(to_multiple) %>%
summarize(median(num_char))
## # A tibble: 2 x 2
## to_multiple `median(num_char)`
## <fct> <dbl>
## 1 0 7.20
## 2 1 5.36
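# Yes, karena median num_char untuk email not-spam yang dikirim ke banyak orang (5.36) lebih kecil daripada yang dikirim ke satu orang (7.20)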
# Question 1
## For emails containing the word "dollar", does the typical spam email
## contain a greater number of occurences of the word than the typical non-spam email?
table(email$dollar)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 3175 120 151 10 146 20 44 12 35 10 22 10 20 7 14 5
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 32
## 23 2 14 1 10 7 12 7 7 3 7 1 5 1 1 2
## 34 36 40 44 46 48 54 63 64
## 1 2 3 3 2 1 1 1 3
#EXPLANATION
email %>%
filter(dollar > 0) %>%
group_by(spam) %>%
summarize(median(dollar))
## # A tibble: 2 x 2
## spam `median(dollar)`
## <fct> <dbl>
## 1 not-spam 4
## 2 spam 2
# NO, karena berdasarkan tabel di atas median dollar untuk not-spam adalah 4 sedangkan untuk spam adalah 2
#EXPLANATION
# Question 2
## If you encounter an email with greater than 10 occurrences of the word "dollar",
## is it more likely to be spam or not -spam?
email %>%
filter(dollar > 10) %>%
ggplot(aes(x = spam)) +
geom_bar()
# Jawaban: not-spam. Grafik di atas hanya memuat email dengan kata "dollar"
# lebih dari 10 kali, dan pada kelompok tersebut jumlah email not-spam
# lebih banyak daripada jumlah email spam.
#EXPLANATION
levels(email$number)
## [1] "none" "small" "big"
table(email$number)
##
## none small big
## 549 2827 545
#EXPLANATION
# Fix the order of the number levels: none, then small, then big.
email[["number"]] <- factor(
  email[["number"]],
  levels = c("none", "small", "big")
)
# Bar chart of number, drawn in a separate panel for each spam class.
email %>%
  ggplot(aes(x = number)) +
  geom_bar() +
  facet_wrap(~ spam)
What’s in a number interpretation -Given that an email contains a small
number, it is more likely to be not-spam. - Given that an email contains
a big number, it is more likely to be not-spam. - Within both spam and
not-spam, the most common number is a small one.
#EXPLANATION