# Names of every package that belongs to the tidyverse meta-package.
All <- tidyverse::tidyverse_packages()

# Print a centered, padded welcome banner.
cli::boxx(
  "Hello there! Let's Explore Tidyverse Package",
  padding = 1,
  float = "center"
)

## +--------------------------------------------------+
## |                                                  |
## |   Hello there! Let's Explore Tidyverse Package   |
## |                                                  |
## +--------------------------------------------------+

# Show the package list.
print(All)

##  [1] "broom"       "cli"         "crayon"      "dplyr"       "dbplyr"     
##  [6] "forcats"     "ggplot2"     "haven"       "hms"         "httr"       
## [11] "jsonlite"    "lubridate"   "magrittr"    "modelr"      "purrr"      
## [16] "readr"       "readxl\n(>=" "reprex"      "rlang"       "rstudioapi" 
## [21] "rvest"       "stringr"     "tibble"      "tidyr"       "xml2"       
## [26] "tidyverse"

Data Source

from https://fivethirtyeight.com/features/the-worst-tweeter-in-politics-isnt-trump/

https://raw.githubusercontent.com/fivethirtyeight/data/master/twitter-ratio/BarackObama.csv

readr : Read File

There are many types of files, and with the readr package you can read your data directly into R using readr's functions.

read_csv() and read_tsv() are special cases of the general read_delim(). They’re useful for reading the most common types of flat file data, comma separated values and tab separated values, respectively. read_csv2() uses ; for the field separator and , for the decimal point. This is common in some European countries.

Read_CSV

# CSV locations of the three FiveThirtyEight "twitter-ratio" data sets.
obama <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/twitter-ratio/BarackObama.csv"
trump <- "https://github.com/fivethirtyeight/data/raw/master/twitter-ratio/realDonaldTrump.csv"
senators <- "https://github.com/fivethirtyeight/data/raw/master/twitter-ratio/senators.csv"

# read_csv() reads a data set from an online location or from local disk;
# here each one is read straight from its URL.
D_obama <- read_csv(file = obama)

## Parsed with column specification:
## cols(
##   created_at = col_character(),
##   text = col_character(),
##   url = col_character(),
##   replies = col_double(),
##   retweets = col_double(),
##   favorites = col_double(),
##   user = col_character()
## )

D_trump <- read_csv(file = trump)

## Parsed with column specification:
## cols(
##   created_at = col_character(),
##   text = col_character(),
##   url = col_character(),
##   replies = col_double(),
##   retweets = col_double(),
##   favorites = col_double(),
##   user = col_character()
## )

D_senators <- read_csv(file = senators)

## Parsed with column specification:
## cols(
##   created_at = col_character(),
##   text = col_character(),
##   url = col_character(),
##   replies = col_double(),
##   retweets = col_double(),
##   favorites = col_double(),
##   user = col_character(),
##   bioguide_id = col_character(),
##   party = col_character(),
##   state = col_character()
## )

# From here on, only D_trump is used.

# Inspect what was read: str() prints every column with its type and first values.
str(D_trump)

## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3232 obs. of  7 variables:
##  $ created_at: chr  "10/23/17 12:30" "10/23/17 11:53" "10/23/17 11:42" "10/22/17 12:08" ...
##  $ text      : chr  "I had a very respectful conversation with the widow of Sgt. La David Johnson, and spoke his name from beginning"| __truncated__ "Two dozen NFL players continue to kneel during the National Anthem, showing total disrespect to our Flag &amp; "| __truncated__ "There will be NO change to your 401(k). This has always been a great and popular middle class tax break that wo"| __truncated__ "It is finally sinking through. 46% OF PEOPLE BELIEVE MAJOR NATIONAL NEWS ORGS FABRICATE STORIES ABOUT ME. FAKE "| __truncated__ ...
##  $ url       : chr  "https://twitter.com/realDonaldTrump/status/922440008971292672" "https://twitter.com/realDonaldTrump/status/922430688703451136" "https://twitter.com/realDonaldTrump/status/922428118685581313" "https://twitter.com/realDonaldTrump/status/922072236592435200" ...
##  $ replies   : num  46228 31419 9552 56238 32136 ...
##  $ retweets  : num  10243 14006 13719 25102 21573 ...
##  $ favorites : num  49468 62406 62662 112890 97145 ...
##  $ user      : chr  "realDonaldTrump" "realDonaldTrump" "realDonaldTrump" "realDonaldTrump" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   created_at = col_character(),
##   ..   text = col_character(),
##   ..   url = col_character(),
##   ..   replies = col_double(),
##   ..   retweets = col_double(),
##   ..   favorites = col_double(),
##   ..   user = col_character()
##   .. )

head(D_trump)

Set Column Name

# Custom column names can be applied in two ways: rename the columns after the
# data set has been read, or pass the names to read_csv() while reading.
# When the names are supplied up front, the header line of the file has to be
# skipped (skip = 1) so that it is not read as a data row.
# NOTE: this vector is called `names`, like base::names(); calls such as
# names(x) still work because R ignores non-function objects when it looks up
# a function.
names <- c(
  "TweetDate", "Tweets", "TweetURL",
  "Count_Reply", "Count_Retweet", "Count_Fav",
  "UserID"
)

# Read the file again with the header skipped and the custom names applied.
Dn_trump <- read_csv(trump, skip = 1, col_names = names)

## Parsed with column specification:
## cols(
##   TweetDate = col_character(),
##   Tweets = col_character(),
##   TweetURL = col_character(),
##   Count_Reply = col_double(),
##   Count_Retweet = col_double(),
##   Count_Fav = col_double(),
##   UserID = col_character()
## )

head(Dn_trump)

Read column by Type

Let's say you want to force the data type of some columns as the file is read. You can set the type of each column with col_types in read_csv(), i.e. c = character, i = integer, n = number, d = double, l = logical, f = factor, D = date, T = date time, t = time, ? = guess, or _/- to skip the column.

# Alternative: rename the columns of the tibble that is already loaded.
names(D_trump) <- names

# Read with explicit column types, given as one letter per column:
# three character, three double, then one character column.
Type <- "cccdddc"
read_csv(trump, col_types = Type)

# Check the data.
head(D_trump)

Subset

# Read only the first two rows.
D_trump[1:2, ]

# Read the first two columns.
D_trump[, 1:2]

# Read row 11, with the 2nd and 3rd columns only.
D_trump[11, 2:3]

# Create a subset of the data with Count_Fav greater than 100k (100000).
# filter() evaluates its conditions inside the data frame, so the column is
# referenced by its bare name rather than through D_trump$...
D_trump_Fav_100k <- filter(D_trump, Count_Fav > 100000)

head(D_trump_Fav_100k)

lubridate : Working with date

Want to read the day, month, date, weekday, hour, etc. from a date-time value? lubridate provides handy functions to do it.

Read Date

# Working with a single date value.
# Look at the first raw timestamp.
D_trump$TweetDate[1]

## [1] "10/23/17 12:30"

# Split the timestamp into its date part and its time part.
date_string <- str_split(D_trump$TweetDate[1], " ", n = 2, simplify = TRUE)
date_string

##      [,1]       [,2]   
## [1,] "10/23/17" "12:30"

# First piece: the date.
data_date <- date_string[1, 1]
data_date

## [1] "10/23/17"

# Second piece: the time.
data_time <- date_string[1, 2]
data_time

## [1] "12:30"

# Append seconds so the text has an hour:minute:second layout.
new_date <- paste0(D_trump$TweetDate[1], ":00")
new_date

## [1] "10/23/17 12:30:00"

# Parse the month/day/year hour:minute:second text into a date-time.
lub_date <- mdy_hms(new_date)
lub_date

## [1] "2017-10-23 12:30:00 UTC"

# Set the time zone of the date-time.
tz(lub_date) <- "America/New_York"

Month

# Month as a number.
month(lub_date)

## [1] 10

# Month as a name.
month(lub_date, label = TRUE)

## [1] Oct
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec

Days

day(lub_date) # Day of the month

## [1] 23

days_in_month(lub_date) # How many days are in this month

## Oct 
##  31

Weekday

# Derived values such as the weekday can be extracted as well.
# As a number: the recorded output is 2 for this Monday.
wday(lub_date)

## [1] 2

# As a labelled factor.
wday(lub_date, label = TRUE)

## [1] Mon
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat

Day of the year

# Day of the year.
yday(lub_date)

## [1] 296

Change Timezone

# with_tz() shows the date-time as clock time in another time zone.
# NOTE(review): with no zone given this presumably falls back to the session's
# time zone (the recorded output is EDT) — confirm.
with_tz(lub_date)

## [1] "2017-10-23 12:30:00 EDT"

with_tz(lub_date, "America/Chicago")

## [1] "2017-10-23 11:30:00 CDT"

with_tz(lub_date, "America/Los_Angeles")

## [1] "2017-10-23 09:30:00 PDT"

Current Time

# Current date-time.
now()

## [1] "2019-04-20 00:38:57 EDT"

# Time zone of the current date-time; the recorded output is an empty string.
tz(now())

## [1] ""

Date in Interval

%within%

%within% tests whether a date or interval (a) falls within an interval (b). The arguments are recycled according to standard R rules. If b is a list of intervals, a is checked if it falls within any of the intervals in b. If a is an interval, both its start and end dates must fall within b to return TRUE.

## recycling
# Four dates are tested against two blackout intervals.
dates <- ymd(c("2014-12-20", "2014-12-30", "2015-01-01", "2015-01-03"))
blackouts <- c(
  interval(ymd("2014-12-30"), ymd("2014-12-31")),
  interval(ymd("2014-12-30"), ymd("2015-01-03"))
)
dates %within% blackouts

## [1] FALSE  TRUE FALSE  TRUE

# Put all the date values together.
# mdy_hm() parses "month/day/year hour:minute" text directly, so no ":00" has
# to be pasted on first, and `tz` assigns the time zone while parsing instead
# of in a separate step.
D_trump <- D_trump %>%
  mutate(
    newDate = mdy_hm(TweetDate, tz = "America/New_York"),
    month = month(newDate, label = TRUE),
    wday = wday(newDate, label = TRUE)
  )

ggplot2 package

Subset of Data

# Keep only the date columns and the two count columns.
newTestDate <- select(D_trump, newDate, month, wday, Count_Retweet, Count_Reply)

Bar graph

# Base plot object: replies per weekday, one fill colour per weekday.
g_base <- ggplot(
  data = newTestDate,
  mapping = aes(x = wday, y = Count_Reply, fill = wday)
)
# Add the column layer and draw the plot.
g_base + geom_col()

Using scatterplot

# Replies over time, coloured by weekday.
ggplot(newTestDate, mapping = aes(x = newDate, y = Count_Reply, color = wday)) +
  geom_point()

Using facet_wrap

Using wrap function

# One panel per month.
# The facet variable is the `month` column of the plotted data, not a column
# of a different data frame, so the panels cannot fall out of step with the
# rows being drawn. Panels are labelled with month abbreviations.
g_base + geom_col() + facet_wrap(~month)

Using Theme

  # Density of the reply counts, one panel per month.
  # The x axis carries Count_Reply and the y axis the estimated density, so
  # the axis labels say exactly that. Panels come from the `month` column of
  # the plotted data.
  ggplot(newTestDate, mapping = aes(x = Count_Reply)) +
    geom_density(alpha = .2, fill = "#FF6666") +
    facet_wrap(~month) +
    ggtitle("Reply by Month") +
    xlab("Count Reply") +
    ylab("Density") +
    theme(legend.position = "top")

  # Number of tweets in each month.
  ggplot(fct_count(newTestDate$month), mapping = aes(x = f, y = n)) +
    geom_col()

Using Density

  # Density of the per-month tweet counts.
  ggplot(data = fct_count(newTestDate$month)) +
    geom_density(mapping = aes(x = n), alpha = .2, fill = "#FF6666")

Using Geom Smooth

  # Smoothed retweet trend for the tweets posted on a Sunday.
  newTestDate %>%
    filter(wday == "Sun") %>%
    ggplot(mapping = aes(x = date(newDate), y = Count_Retweet, color = wday)) +
    geom_smooth(alpha = .2, span = .5, method = "auto", stat = "smooth")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Using smooth with method

   # Linear fit without a confidence band for four selected weekdays.
   newTestDate %>%
     filter(wday %in% c("Sat", "Sun", "Mon", "Tue")) %>%
     ggplot(mapping = aes(x = date(newDate), y = Count_Retweet, color = wday)) +
     geom_smooth(method = lm, se = FALSE)

dplyr : Working with Group by

Checking Group By

  # Interactive table of the plotting data.
  DT::datatable(newTestDate)

  # Count the tweets for every month / weekday combination.
  # group_by() on its own has no impact on the data; it takes effect once an
  # aggregating verb such as summarise() runs on the grouped result.
  newTestDate %>%
    group_by(month, wday) %>%
    summarise(Tweetcount = n())

  # Validate the result with a base-R contingency table.
  table(newTestDate$month, newTestDate$wday)

##      
##       Sun Mon Tue Wed Thu Fri Sat
##   Jan  31  26  34  33  30  40  18
##   Feb  19  13  14  28  23  23  27
##   Mar   6  16  24  21  27  35  14
##   Apr  22  25  21  20  22  13  27
##   May  18  20  21  25  23  26  13
##   Jun  15  35  40  27  38  38  15
##   Jul  28  37  37  35  21  33  51
##   Aug  27  42  39  54  56  65  46
##   Sep  68  46 117  82  98  91  91
##   Oct  85 130  81 135 159  55  88
##   Nov  35  17  55  34  15  21  15
##   Dec  20  15  22  15  20  17  28

    # Total retweets for every month / weekday combination.
    # The column is named for what it holds (a retweet total, not a tweet
    # count). ungroup() is applied to the summarised result; newTestDate itself
    # is never grouped, so ungrouping it would do nothing.
    group_by(newTestDate, month, wday) %>%
      summarise(RetweetTotal = sum(Count_Retweet)) %>%
      ungroup()

    # Take the long route to validate one cell of that summary: subset to
    # month "Jan" and weekday "Sun", then sum Count_Retweet by hand.
    # Columns are referenced by bare name inside filter().
    checkWeekDay <- filter(newTestDate, month == "Jan" & wday == "Sun")
    sum(checkWeekDay$Count_Retweet)

## [1] 719156

    # Wow, our data from last group_by output matches.

Working with filter

#---------------------------------

# filter()
# 

# List all the tweets that have more than 100 replies.
# Columns are referenced by bare name inside filter().
filter(D_trump, Count_Reply > 100)

stringr package

Find word

Find word starting with something

# Use stringr to list every tweet whose text starts with "RT", i.e. retweets.
# Columns are referenced by bare name inside filter().
filter(D_trump, str_detect(Tweets, "^RT"))

Find exact

# Find the tweets that mention "border", in any capitalisation.
# A case-insensitive pattern treats every spelling the same way; listing the
# variants as " border|Border|BORDER" demanded a leading space for the
# lower-case form only.
filter(D_trump, str_detect(Tweets, regex("border", ignore_case = TRUE)))

find all the Pattern

# filter() creates a subset of the data: here, tweets that mention a handle.
# Columns are referenced by bare name inside filter().
filter(D_trump, str_detect(Tweets, "@[[:alnum:]]+"))

# The same pattern can be applied to the column itself and the result stored
# for further processing: one row per tweet, one column per match.
# NOTE(review): [[:alnum:]] does not include "_", so a handle containing an
# underscore would be cut at that point — confirm whether "@[[:alnum:]_]+" is
# what is wanted here.
At_Tweet <- str_extract_all(D_trump$Tweets, "@[[:alnum:]]+", simplify = TRUE)

tail(At_Tweet, 15)

##         [,1]       [,2]     [,3]               [,4] [,5] [,6] [,7]
## [3218,] ""         ""       ""                 ""   ""   ""   ""  
## [3219,] ""         ""       ""                 ""   ""   ""   ""  
## [3220,] ""         ""       ""                 ""   ""   ""   ""  
## [3221,] ""         ""       ""                 ""   ""   ""   ""  
## [3222,] ""         ""       ""                 ""   ""   ""   ""  
## [3223,] "@CNN"     ""       ""                 ""   ""   ""   ""  
## [3224,] "@Trump"   "@Nigel" "@realDonaldTrump" ""   ""   ""   ""  
## [3225,] ""         ""       ""                 ""   ""   ""   ""  
## [3226,] "@Mike"    ""       ""                 ""   ""   ""   ""  
## [3227,] "@FoxNews" ""       ""                 ""   ""   ""   ""  
## [3228,] ""         ""       ""                 ""   ""   ""   ""  
## [3229,] ""         ""       ""                 ""   ""   ""   ""  
## [3230,] ""         ""       ""                 ""   ""   ""   ""  
## [3231,] ""         ""       ""                 ""   ""   ""   ""  
## [3232,] ""         ""       ""                 ""   ""   ""   ""

dplyr : Working with Join

full_join
inner_join
anti_join
left_join
right_join
nest_join
semi_join

Data

#----------------------JOIN
# Two overlapping slices of D_trump that share the TweetURL column.
table1 <- D_trump %>%
  slice(5:20) %>%
  select(TweetURL, TweetDate)
table2 <- D_trump %>%
  slice(1:15) %>%
  select(TweetURL, Tweets)

glimpse(table1)

## Observations: 16
## Variables: 2
## $ TweetURL  <chr> "https://twitter.com/realDonaldTrump/status/92207065...
## $ TweetDate <chr> "10/22/17 12:02", "10/22/17 11:50", "10/22/17 0:09",...

glimpse(table2)

## Observations: 15
## Variables: 2
## $ TweetURL <chr> "https://twitter.com/realDonaldTrump/status/922440008...
## $ Tweets   <chr> "I had a very respectful conversation with the widow ...

Full_join

# The URL is unique for each tweet, so it is used as the join key.
table_Full <- dplyr::full_join(table1, table2, by = "TweetURL")
glimpse(table_Full)

## Observations: 20
## Variables: 3
## $ TweetURL  <chr> "https://twitter.com/realDonaldTrump/status/92207065...
## $ TweetDate <chr> "10/22/17 12:02", "10/22/17 11:50", "10/22/17 0:09",...
## $ Tweets    <chr> "Wacky Congresswoman Wilson is the gift that keeps o...

# Both data sets are combined here: Tweets is NA for some entries and
# TweetDate is NA for some others.
table_Full

inner_join

# Inner join on the shared TweetURL column.
table_Inner <- table1 %>%
  inner_join(table2, by = "TweetURL")

# table_Inner holds the data that is common to table1 and table2.
table_Inner

anti_join

# anti_join() lists the data from table1 that is not in table2.
table_Anti <- table1 %>%
  dplyr::anti_join(table2, by = "TweetURL")

table_Anti

left_join

# left_join() lists all the data from the left table (table1) together with
# the matching data from the right table (table2).
table_Left <- table1 %>%
  dplyr::left_join(table2, by = "TweetURL")

table_Left

# The left table, for comparison.
table1

right_join

# right_join() lists all the data from the right table (table2) together with
# the matching data from the left table (table1).
table_Right <- table1 %>%
  dplyr::right_join(table2, by = "TweetURL")

table_Right

# The right table, for comparison.
table2

nest_join

# To join a table as a nested tibble, use nest_join().
table_Next <- dplyr::nest_join(table1, table2, by = "TweetURL")
table_Next

# The joined table2 is added as a tibble in each row.
table_Next[1, ]

# The key column is called TweetURL. The tables have no `url` column, so
# asking for `$url` only gives an "Unknown or uninitialised column" warning
# and NULL.
table_Next$TweetURL[1]

# Nested table2 rows that belong to the first row of table1.
table_Next$table2[[1]]

# keep = TRUE keeps the key column in the nested table as well.
table_NextKeep <- dplyr::nest_join(table1, table2, by = "TweetURL", keep = TRUE)
table_NextKeep[1, ]

# The key column is called TweetURL. The tables have no `url` column, so
# asking for `$url` only gives an "Unknown or uninitialised column" warning
# and NULL.
table_NextKeep$TweetURL[1]

# Nested table2 rows that belong to the first row of table1.
table_NextKeep$table2[[1]]

semi_join

# semi_join() returns all rows from x (the left table, table1) that have
# matching values in y (the right table, table2), keeping just the columns
# of table1.
table_Semi <- table1 %>%
  dplyr::semi_join(table2, by = "TweetURL")
table_Semi

forcats package

#---------------------------------

# The forcats package
# forcats is a core package in the tidyverse. It is installed via install.packages("tidyverse") and attached via  library(tidyverse). You can always load it individually via library(forcats). Main functions start with  fct_. There really is no coherent family of base functions that forcats replaces - that's why it's such a welcome addition.


# Load the diamonds example data set.
data("diamonds")

# Structure of the `cut` column.
str(diamonds$cut)

##  Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...

# Count the rows per level with the dplyr package ...
count(diamonds, cut)

# ... and get the same output with the forcats package.
fct_count(diamonds$cut)

CLI package :

-------- * CLI Package * --------

# Horizontal rule with a centered label and a red line.
rule(center = " * CLI Package * ", line_col = "red")

## --------  * CLI Package *  --------

# Rule drawn with the "bar2" line style.
rule(line = "bar2")

## ___________________________________

# NOTE(review): the recorded output repeats the literal text "dots2", so this
# value is presumably used as a custom line string, not a named line style —
# confirm.
rule(line = "dots2")

## dots2dots2dots2dots2dots2dots2dots2

# Rule with a centered label, coloured red through `col`.
rule(center = " * RESULTS * ", col = "red")

## ----------  * RESULTS *  ----------

# Names of the available spinners.
list_spinners()

##  [1] "dots"                "dots2"               "dots3"              
##  [4] "dots4"               "dots5"               "dots6"              
##  [7] "dots7"               "dots8"               "dots9"              
## [10] "dots10"              "dots11"              "dots12"             
## [13] "line"                "line2"               "pipe"               
## [16] "simpleDots"          "simpleDotsScrolling" "star"               
## [19] "star2"               "flip"                "hamburger"          
## [22] "growVertical"        "growHorizontal"      "balloon"            
## [25] "balloon2"            "noise"               "bounce"             
## [28] "boxBounce"           "boxBounce2"          "triangle"           
## [31] "arc"                 "circle"              "squareCorners"      
## [34] "circleQuarters"      "circleHalves"        "squish"             
## [37] "toggle"              "toggle2"             "toggle3"            
## [40] "toggle4"             "toggle5"             "toggle6"            
## [43] "toggle7"             "toggle8"             "toggle9"            
## [46] "toggle10"            "toggle11"            "toggle12"           
## [49] "toggle13"            "arrow"               "arrow2"             
## [52] "arrow3"              "bouncingBar"         "bouncingBall"       
## [55] "smiley"              "monkey"              "hearts"             
## [58] "clock"               "earth"               "moon"               
## [61] "runner"              "pong"                "shark"              
## [64] "dqpb"

# Centered banner with a padding of 1.
cli::boxx("Hello there!", padding = 1, float = "center")

##         +------------------+
##         |                  |
##         |   Hello there!   |
##         |                  |
##         +------------------+

TidyVerse

Rajwant Mishra

April 14, 2019