First set up your libraries.
# Make the course-materials folder the working directory for this session.
# NOTE(review): this is an absolute, machine-specific path; change it to match
# your own machine before running.
setwd(dir = "C:/Users/tomas/Downloads/eda-course-materials")
library(dslabs)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
To open a Tab Separated Values file (.tsv), use read_tsv.
# Read the tab-separated pseudo-Facebook data into `pf`; readr prints the
# guessed column specification as a message.
# NOTE(review): the path is absolute and machine-specific, and the doubled
# forward slashes look unnecessary — TODO confirm before simplifying.
pf <- read_tsv(
  file = "C://Users//tomas//Downloads//eda-course-materials//lesson3//pseudo_facebook.tsv"
)
##
## -- Column specification --------------------------------------------------------
## cols(
## userid = col_double(),
## age = col_double(),
## dob_day = col_double(),
## dob_year = col_double(),
## dob_month = col_double(),
## gender = col_character(),
## tenure = col_double(),
## friend_count = col_double(),
## friendships_initiated = col_double(),
## likes = col_double(),
## likes_received = col_double(),
## mobile_likes = col_double(),
## mobile_likes_received = col_double(),
## www_likes = col_double(),
## www_likes_received = col_double()
## )
# List the column names of the data set.
colnames(pf)
## [1] "userid" "age" "dob_day"
## [4] "dob_year" "dob_month" "gender"
## [7] "tenure" "friend_count" "friendships_initiated"
## [10] "likes" "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes" "www_likes_received"
Histograms are created with qplot(data=DATASET, x=VARIABLE).
To personalize the divisions on the x axis use + scale_x_continuous(breaks=1:x).
In order to divide your variable into multiple histograms along a splitting variable, use + facet_wrap(~SPLITTING_VARIABLE, ncol=#OFCOLUMNS); the ncol argument is optional.
# Histogram of birth day, drawn as one panel per birth month (three panels
# per row), with an x-axis tick at every integer from 1 to 31.
qplot(x = dob_day, data = pf) +
  facet_wrap(~dob_month, ncol = 3) +
  scale_x_continuous(breaks = 1:31)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
To remove outliers and make the plot easier to read, you can limit the axis with + scale_y_continuous(limits=c(0,MAXIMUM)).
You can also use qplot(data=DATASET, x=VARIABLE, ylim = c(0,MAXIMUM)).
To setup the width of the bars (bin), use binwidth=X, and the breaks of the x axis can be personalized with scale_x_continuous(breaks=seq(START, END, STEPS)).
# Same faceted birth-day histogram, now with one bin per day.
# The y axis is limited to 0-750 with a tick every 50; bars that fall outside
# the limits are dropped from the plot (ggplot2 reports them in a warning).
qplot(x = dob_day, data = pf, binwidth = 1) +
  facet_wrap(~dob_month, ncol = 3) +
  scale_x_continuous(breaks = 1:31) +
  scale_y_continuous(breaks = seq(0, 750, 50), limits = c(0, 750))
## Warning: Removed 1 rows containing missing values (geom_bar).
In order to summarize one variable separately for each value of another variable, we can use by(VARIABLE 1, VARIABLE 2, SUMMARY).
# Apply summary() to friend_count separately for each value of gender.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## ------------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
To add color and labels to a histogram, add color=I("COLOR") for the outline, fill=I("COLOR") for the inside, and xlab="LABEL", ylab="LABEL" for the axis labels.
# Histogram of tenure / 365 with black bar outlines and a light-blue fill.
# NOTE(review): presumably tenure is recorded in days, so tenure / 365 is
# years (the x label says "Years") — confirm against the data description.
# The x axis is restricted to 0-7 with a tick every 0.5; values outside that
# range are dropped, which is what the warnings report.
qplot(
  x = tenure / 365,
  data = pf,
  binwidth = 0.10,
  xlab = "Years using Facebook",
  ylab = "Number of users",
  color = I("black"),
  fill = I("lightblue")
) +
  scale_x_continuous(limits = c(0, 7), breaks = seq(0, 7, 0.5))
## Warning: Removed 26 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
To transform the scale of your axis, use the transformation (trans) argument of + scale_x_continuous(), or a new layer called scale_x_TRANSFORMATION().
To view multiple plots at once, use the library(gridExtra) function grid.arrange(PLOT1, PLOT2, PLOT3, ncol=#OF COLUMNS).
You have to create new objects for each plot first.
# Three histograms of friend_count, each stored in its own object so they can
# be arranged together afterwards.

# 1. Untransformed x axis, bins of width 100.
Friend_Count <- qplot(
  x = friend_count,
  data = pf,
  binwidth = 100,
  xlab = "Number of friends",
  ylab = "Users",
  color = I("black"),
  fill = I("yellow")
)

# 2. log10-transformed x axis; the bin width (0.05) is applied on the
#    transformed scale.
Friend_Count_log10 <- qplot(
  x = friend_count,
  data = pf,
  binwidth = 0.05,
  xlab = "Number of friends (log 10)",
  ylab = "Users",
  color = I("black"),
  fill = I("blue")
) +
  scale_x_continuous(trans = "log10")

# 3. Square-root-transformed x axis with ticks every 500 friends.
Friend_Count_sqrt <- qplot(
  x = friend_count,
  data = pf,
  binwidth = 1,
  xlab = "Number of friends (Square root)",
  ylab = "Users",
  color = I("black"),
  fill = I("red")
) +
  scale_x_continuous(breaks = seq(0, 5000, 500), trans = "sqrt")
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Stack the three histograms vertically in a single column.
grid.arrange(
  Friend_Count,
  Friend_Count_log10,
  Friend_Count_sqrt,
  ncol = 1
)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
Frequency polygons can display several variables in the same plot. To create a frequency polygon, we add geom="freqpoly".
We can discriminate by gender with color=gender.
We subset the data to eliminate users with no gender (NA). subset(DATASET, function(VARIABLE)).
The ! operator means NOT.
# Frequency polygon of friend_count with one coloured line per gender.
# Rows whose gender is NA are removed first so they do not get their own line.
# y is each bin's count divided by the summed bin counts, times 1000.
# after_stat(count) is the current way to refer to the variable computed by
# the stat; it replaces the legacy ..count.. dot-dot notation.
# NOTE(review): the y label says "Users" although y is a scaled proportion,
# not a raw count — consider relabelling.
# The x axis is restricted to 0-1000 with a tick every 100; values outside
# that range are dropped, which is what the warnings report.
qplot(
  data = subset(pf, !is.na(gender)),
  x = friend_count,
  y = (after_stat(count) / sum(after_stat(count))) * 1000,
  geom = "freqpoly",
  binwidth = 5,
  color = gender,
  xlab = "Number of Friends",
  ylab = "Users"
) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 100))
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 row(s) containing missing values (geom_path).
To create a boxplot, we use the geom="boxplot" addition to qplot.
The y axis should be the continuous (numeric) variable and the x axis the categorical variable (the factor).
The black points are the outliers.
Instead of scale_y_continuous(limits=c(0,x)), which drops the data points outside the limits, we use coord_cartesian(ylim = c(0,x)) to zoom in while keeping all of the data points.
# Boxplot of friend_count for each gender, with the boxes filled by gender.
# Rows whose gender is NA are removed before plotting.
# coord_cartesian() only zooms the y axis to 0-700; unlike scale limits it
# does not discard observations, so the box statistics use all the data.
qplot(
  x = gender,
  y = friend_count,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot",
  xlab = "Gender",
  ylab = "Number of friends",
  fill = gender
) +
  coord_cartesian(ylim = c(0, 700))