Exploring one variable

Loading .tsv

First set up your libraries.

# NOTE(review): hard-coded, machine-specific working directory; change this
# path to your own copy of the course materials before running.
setwd("C:/Users/tomas/Downloads/eda-course-materials")
library(dslabs)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)

To open a tab-separated values (.tsv) file, use read_tsv().

# Read the tab-separated pseudo_facebook.tsv file into `pf`.
pf <- read_tsv(
  file = "C://Users//tomas//Downloads//eda-course-materials//lesson3//pseudo_facebook.tsv"
)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   userid = col_double(),
##   age = col_double(),
##   dob_day = col_double(),
##   dob_year = col_double(),
##   dob_month = col_double(),
##   gender = col_character(),
##   tenure = col_double(),
##   friend_count = col_double(),
##   friendships_initiated = col_double(),
##   likes = col_double(),
##   likes_received = col_double(),
##   mobile_likes = col_double(),
##   mobile_likes_received = col_double(),
##   www_likes = col_double(),
##   www_likes_received = col_double()
## )
# Print the column names of `pf`.
colnames(pf)
##  [1] "userid"                "age"                   "dob_day"              
##  [4] "dob_year"              "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"          "friendships_initiated"
## [10] "likes"                 "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"             "www_likes_received"

Histograms

Histograms are created with qplot(data=DATASET, x=VARIABLE).
To personalize the divisions on the x axis use + scale_x_continuous(breaks=1:x).
In order to divide your variable into multiple histograms along a splitting variable, use + facet_wrap(~SPLITTING_VARIABLE, ncol=#OFCOLUMNS); the ncol argument is optional.

# Histogram of dob_day, split into one panel per dob_month (3 panels per row),
# with an x-axis tick for every value from 1 to 31.
qplot(x = dob_day, data = pf) +
  facet_wrap(~dob_month, ncol = 3) +
  scale_x_continuous(breaks = 1:31)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

To remove outliers and make the plot easier to read, you can limit the axis with + scale_y_continuous(limits=c(0,MAXIMUM)).
You can also use qplot(data=DATASET, x=VARIABLE, ylim = c(0,MAXIMUM)).
To set the width of the bars (bins), use binwidth=X; the breaks of the x axis can be personalized with scale_x_continuous(breaks=seq(START, END, STEPS)).

# Same faceted dob_day histogram, now with a bin width of 1 and the y axis
# limited to 0-750 with a tick every 50.
qplot(x = dob_day, data = pf, binwidth = 1) +
  facet_wrap(~dob_month, ncol = 3) +
  scale_x_continuous(breaks = 1:31) +
  scale_y_continuous(breaks = seq(0, 750, by = 50), limits = c(0, 750))
## Warning: Removed 1 rows containing missing values (geom_bar).

To summarize one variable separately for each value of another variable, use by(VARIABLE, GROUPING_VARIABLE, FUNCTION), for example with summary as the function.

# Apply summary() to friend_count separately for each gender value.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)
## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## ------------------------------------------------------------ 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917

To add color and labels to a histogram, add color=I("COLOR") for the outline, fill=I("COLOR") for the inside, and xlab="LABEL", ylab="LABEL" for the axis labels.

# Histogram of tenure / 365 (labelled as years) with 0.1-wide bins, black
# outlines and light-blue fill; the x axis covers 0-7 with a tick every 0.5.
qplot(
  x = tenure / 365, data = pf, binwidth = 0.1,
  color = I("black"), fill = I("lightblue"),
  ylab = "Number of users",
  xlab = "Years using Facebook"
) +
  scale_x_continuous(limits = c(0, 7), breaks = seq(0, 7, by = 0.5))
## Warning: Removed 26 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

To transform the scale of an axis, pass the transformation (trans) argument to + scale_x_continuous(), or use a dedicated layer called scale_x_TRANSFORMATION() (for example scale_x_log10()).
To view multiple plots at once, use the grid.arrange(PLOT1, PLOT2, PLOT3, ncol=#OF COLUMNS) function from library(gridExtra).
You have to create new objects for each plot first.

# Three histograms of friend_count, stored as objects so they can be arranged
# together later: untransformed, log10 x scale, and square-root x scale.

# Untransformed x axis, bins 100 wide.
Friend_Count <- qplot(
  x = friend_count, data = pf, binwidth = 100,
  color = I("black"), fill = I("yellow"),
  ylab = "Users",
  xlab = "Number of friends"
)

# log10-transformed x axis; binwidth is given on the transformed scale.
Friend_Count_log10 <- qplot(
  x = friend_count, data = pf, binwidth = 0.05,
  color = I("black"), fill = I("blue"),
  ylab = "Users",
  xlab = "Number of friends (log 10)"
) +
  scale_x_continuous(trans = "log10")

# Square-root-transformed x axis with ticks every 500 friends.
Friend_Count_sqrt <- qplot(
  x = friend_count, data = pf, binwidth = 1,
  color = I("black"), fill = I("red"),
  ylab = "Users",
  xlab = "Number of friends (Square root)"
) +
  scale_x_continuous(breaks = seq(0, 5000, by = 500), trans = "sqrt")

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Draw the three friend-count histograms stacked in a single column.
grid.arrange(
  Friend_Count,
  Friend_Count_log10,
  Friend_Count_sqrt,
  ncol = 1
)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1962 rows containing non-finite values (stat_bin).

Frequency polygons

Frequency polygons can display several variables in the same plot. To create a frequency polygon, we add geom="freqpoly".
We can discriminate by gender with color=gender.
We subset the data to eliminate users with no gender (NA). subset(DATASET, function(VARIABLE)).
The ! operator means NOT.

# Frequency polygons of friend_count, one line per gender, excluding rows
# whose gender is NA. The x axis is limited to 0-1000 with a tick every 100.
# The y value is each bin's count divided by sum(count) and multiplied by
# 1000, i.e. a share per 1,000 rather than a raw number of users, so the
# y-axis label states that instead of plain "Users".
qplot(data = subset(pf, !is.na(gender)),
      x = friend_count, y = (..count.. / sum(..count..)) * 1000,
      geom = "freqpoly", binwidth = 5,
      color = gender,
      xlab = "Number of Friends",
      ylab = "Users (per 1,000)") +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 100))
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 row(s) containing missing values (geom_path).

Boxplots

To create a boxplot, we use the geom=“boxplot” addition to qplot.
The y axis should be the continuous variable and the x axis the categorical variable (factor).
The black points are the outliers.
Instead of scale_y_continuous(limits=c(0,x)) we use coord_cartesian(ylim = c(0,x)), which zooms the axis while keeping all of the data points.

# Box plot of friend_count for each gender (rows with NA gender excluded),
# filled by gender; coord_cartesian restricts the visible y range to 0-700.
qplot(
  x = gender, y = friend_count,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot",
  fill = gender,
  ylab = "Number of friends",
  xlab = "Gender"
) +
  coord_cartesian(ylim = c(0, 700))