# Collect the names of every package that belongs to the tidyverse.
All <- tidyverse::tidyverse_packages()
# Print a padded, centred banner before listing the packages.
cli::boxx(
  "Hello there! Let's Explore Tidyverse Package",
  padding = 1,
  float = "center"
)
## +--------------------------------------------------+
## | |
## | Hello there! Let's Explore Tidyverse Package |
## | |
## +--------------------------------------------------+
# Show the package names collected above.
print(All)
## [1] "broom" "cli" "crayon" "dplyr" "dbplyr"
## [6] "forcats" "ggplot2" "haven" "hms" "httr"
## [11] "jsonlite" "lubridate" "magrittr" "modelr" "purrr"
## [16] "readr" "readxl\n(>=" "reprex" "rlang" "rstudioapi"
## [21] "rvest" "stringr" "tibble" "tidyr" "xml2"
## [26] "tidyverse"
Data source: https://fivethirtyeight.com/features/the-worst-tweeter-in-politics-isnt-trump/
Example file: https://raw.githubusercontent.com/fivethirtyeight/data/master/twitter-ratio/BarackObama.csv
There are many types of data files, and with the readr package you can read your data directly into R using readr's reading functions.
read_csv() and read_tsv() are special cases of the general read_delim(). They’re useful for reading the most common types of flat file data, comma separated values and tab separated values, respectively. read_csv2() uses ; for the field separator and , for the decimal point. This is common in some European countries.
# Locations of the three tweet datasets (CSV files hosted on GitHub).
obama <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/twitter-ratio/BarackObama.csv"
trump <- "https://github.com/fivethirtyeight/data/raw/master/twitter-ratio/realDonaldTrump.csv"
senators <- "https://github.com/fivethirtyeight/data/raw/master/twitter-ratio/senators.csv"

# Read the datasets. read_csv() is given a URL here; a path on the local disk
# can be passed in the same way.
D_obama <- read_csv(file = obama)
## Parsed with column specification:
## cols(
## created_at = col_character(),
## text = col_character(),
## url = col_character(),
## replies = col_double(),
## retweets = col_double(),
## favorites = col_double(),
## user = col_character()
## )
D_trump <- read_csv(file = trump)
## Parsed with column specification:
## cols(
## created_at = col_character(),
## text = col_character(),
## url = col_character(),
## replies = col_double(),
## retweets = col_double(),
## favorites = col_double(),
## user = col_character()
## )
D_senators <- read_csv(file = senators)
## Parsed with column specification:
## cols(
## created_at = col_character(),
## text = col_character(),
## url = col_character(),
## replies = col_double(),
## retweets = col_double(),
## favorites = col_double(),
## user = col_character(),
## bioguide_id = col_character(),
## party = col_character(),
## state = col_character()
## )
# From here on we work only with D_trump.
# Take a look at what we have: str() prints the data together with its
# structure (glimpse(), used further down, gives a similar overview).
str(D_trump)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3232 obs. of 7 variables:
## $ created_at: chr "10/23/17 12:30" "10/23/17 11:53" "10/23/17 11:42" "10/22/17 12:08" ...
## $ text : chr "I had a very respectful conversation with the widow of Sgt. La David Johnson, and spoke his name from beginning"| __truncated__ "Two dozen NFL players continue to kneel during the National Anthem, showing total disrespect to our Flag & "| __truncated__ "There will be NO change to your 401(k). This has always been a great and popular middle class tax break that wo"| __truncated__ "It is finally sinking through. 46% OF PEOPLE BELIEVE MAJOR NATIONAL NEWS ORGS FABRICATE STORIES ABOUT ME. FAKE "| __truncated__ ...
## $ url : chr "https://twitter.com/realDonaldTrump/status/922440008971292672" "https://twitter.com/realDonaldTrump/status/922430688703451136" "https://twitter.com/realDonaldTrump/status/922428118685581313" "https://twitter.com/realDonaldTrump/status/922072236592435200" ...
## $ replies : num 46228 31419 9552 56238 32136 ...
## $ retweets : num 10243 14006 13719 25102 21573 ...
## $ favorites : num 49468 62406 62662 112890 97145 ...
## $ user : chr "realDonaldTrump" "realDonaldTrump" "realDonaldTrump" "realDonaldTrump" ...
## - attr(*, "spec")=
## .. cols(
## .. created_at = col_character(),
## .. text = col_character(),
## .. url = col_character(),
## .. replies = col_double(),
## .. retweets = col_double(),
## .. favorites = col_double(),
## .. user = col_character()
## .. )
# Preview the first rows as they were originally read.
head(D_trump)
# Custom column names can be applied in two ways: rename the columns after
# reading, or supply the new names while reading. This part does the latter.
# The vector below holds one replacement name per column, in file order.
names <- c(
  "TweetDate", "Tweets", "TweetURL",
  "Count_Reply", "Count_Retweet", "Count_Fav",
  "UserID"
)
# Skip the single header line so the original column names are not read as
# data, and label the columns with the vector defined above.
Dn_trump <- read_csv(file = trump, col_names = names, skip = 1)
## Parsed with column specification:
## cols(
## TweetDate = col_character(),
## Tweets = col_character(),
## TweetURL = col_character(),
## Count_Reply = col_double(),
## Count_Retweet = col_double(),
## Count_Fav = col_double(),
## UserID = col_character()
## )
# Confirm that the new column names were applied.
head(Dn_trump)
Let's say you want to force the data type of some columns as the file is read. You can set the column types with the col_types argument of read_csv(), using one character per column: c = character, i = integer, n = number, d = double, l = logical, f = factor, D = date, T = date time, t = time, ? = guess, or _/- to skip the column.
# Alternative: rename the columns of the data frame that was already read.
names(D_trump) <- names
# Read the file again with explicit column types. Each character of the
# col_types string describes one column, in order (c = character, d = double).
Type <- "cccdddc"
read_csv(trump, col_types = Type)
# Check the renamed data.
head(D_trump)
# First two rows of the data.
D_trump[1:2, ]
# First two columns of the data.
D_trump[, 1:2]
# Row 11, second and third column only.
D_trump[11, 2:3]
# Subset of tweets with more than 100k (100000) favorites.
# filter() evaluates its condition inside the data frame, so the column is
# referenced by its bare name instead of through D_trump$.
D_trump_Fav_100k <- filter(D_trump, Count_Fav > 100000)
head(D_trump_Fav_100k)
Want to extract the day, month, date, weekday, hour, etc. from a date-time value? The lubridate package provides handy functions for this.

### Read Date
# Working with dates.
# Look at the raw timestamp of the first record.
D_trump$TweetDate[1]
## [1] "10/23/17 12:30"
# Split it at the first space into a date part and a time part; with
# simplify = TRUE the pieces come back as a character matrix.
date_string <- str_split(D_trump$TweetDate[1], pattern = " ", n = 2, simplify = TRUE)
date_string
## [,1] [,2]
## [1,] "10/23/17" "12:30"
data_date <- date_string[1, 1]
data_date
## [1] "10/23/17"
data_time <- date_string[1, 2]
data_time
## [1] "12:30"
# The raw value has no seconds; append ":00" so that it can be parsed as
# month-day-year hour-minute-second.
new_date <- paste0(D_trump$TweetDate[1], ":00")
new_date
## [1] "10/23/17 12:30:00"
lub_date <- mdy_hms(new_date)
lub_date
## [1] "2017-10-23 12:30:00 UTC"
# Set the time zone of the parsed date-time.
lub_date <- force_tz(lub_date, tzone = "America/New_York")
# Month as a number.
month(lub_date)
## [1] 10
# Month as a labelled factor (abbreviated month name).
month(lub_date, label = TRUE)
## [1] Oct
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
# Day of the month.
day(lub_date)
## [1] 23
# Number of days in that month.
days_in_month(lub_date)
## Oct
## 31
# Derived values such as the weekday can be extracted as well.
wday(lub_date)
## [1] 2
wday(lub_date, label = TRUE)
## [1] Mon
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
# Display the same instant in other time zones.
# NOTE(review): with no zone given this presumably falls back to the session's
# time zone - confirm against the with_tz() documentation.
with_tz(lub_date)
## [1] "2017-10-23 12:30:00 EDT"
with_tz(lub_date, tzone = "America/Chicago")
## [1] "2017-10-23 11:30:00 CDT"
with_tz(lub_date, tzone = "America/Los_Angeles")
## [1] "2017-10-23 09:30:00 PDT"
%within%
Tests whether a date or interval `a` falls within an interval `b`. When `a` and `b` have different lengths, they are recycled according to standard R rules. If `b` is a list of intervals, `a` is checked to see whether it falls within any of the intervals in `b`. If `a` is an interval, both its start and end dates must fall within `b` to return TRUE.
## recycling
# Four dates are tested against two intervals; the shorter vector is recycled.
dates <- ymd(c("2014-12-20", "2014-12-30", "2015-01-01", "2015-01-03"))
first_blackout <- interval(ymd("2014-12-30"), ymd("2014-12-31"))
second_blackout <- interval(ymd("2014-12-30"), ymd("2015-01-03"))
blackouts <- c(first_blackout, second_blackout)
dates %within% blackouts
## [1] FALSE TRUE FALSE TRUE
# Put all the date values together for the whole data set.
# Parse every timestamp (seconds appended as ":00"), then set its time zone.
tweet_times <- mdy_hms(paste0(D_trump$TweetDate, ":00"))
D_trump$newDate <- force_tz(tweet_times, tzone = "America/New_York")
D_trump$month <- month(D_trump$newDate, label = TRUE)
D_trump$wday <- wday(D_trump$newDate, label = TRUE)
# Keep only the columns that the plots below need.
plot_columns <- c("newDate", "month", "wday", "Count_Retweet", "Count_Reply")
newTestDate <- D_trump[, plot_columns]
# Base ggplot2 object, reused by the bar charts below.
g_base <- ggplot(
  data = newTestDate,
  mapping = aes(x = wday, y = Count_Reply, fill = wday)
)
g_base + geom_col()
# Replies over time, coloured by weekday.
ggplot(
  data = newTestDate,
  mapping = aes(x = newDate, y = Count_Reply, color = wday)
) +
  geom_point()
Using the facet_wrap() function
# Replies per weekday, one panel per month.
# The facet variable is the `month` column of the plotted data itself, so the
# plot no longer depends on a vector taken from a different data frame.
g_base + geom_col() + facet_wrap(~month)

# Distribution of the reply count, one panel per month.
# The x axis shows the reply count and the y axis its density, so the axis
# labels name exactly that.
ggplot(newTestDate) +
  geom_density(mapping = aes(x = Count_Reply), alpha = .2, fill = "#FF6666") +
  facet_wrap(~month) +
  ggtitle("Reply by Month") +
  xlab("Count Reply") + ylab("Density") +
  theme(legend.position = "top")

# Number of tweets per month: fct_count() returns the level (f) and its count (n).
ggplot(fct_count(newTestDate$month), mapping = aes(x = f, y = n)) + geom_col()
# Distribution of those monthly counts.
ggplot(fct_count(newTestDate$month)) +
  geom_density(mapping = aes(x = n), alpha = .2, fill = "#FF6666")

# Smoothed retweet count over time for tweets posted on Sundays.
filter(newTestDate, wday == "Sun") %>%
  ggplot(mapping = aes(x = date(newDate), y = Count_Retweet, color = wday)) +
  geom_smooth(alpha = .2, span = .5, method = "auto", stat = "smooth")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Linear trend of the retweet count for a selection of weekdays.
filter(newTestDate, wday %in% c("Sat", "Sun", "Mon", "Tue")) %>%
  ggplot(mapping = aes(x = date(newDate), y = Count_Retweet, color = wday)) +
  geom_smooth(method = lm, se = FALSE)
# Interactive table of the plotted data.
DT::datatable(newTestDate)
# Count the tweets for every combination of month and weekday.
# group_by() only marks the groups; nothing is aggregated until a verb such as
# summarise() is run on the grouped data.
newTestDate %>%
  group_by(month, wday) %>%
  summarise(Tweetcount = n())
# Validate the result against a base R cross-tabulation.
table(newTestDate$month, newTestDate$wday)
##
## Sun Mon Tue Wed Thu Fri Sat
## Jan 31 26 34 33 30 40 18
## Feb 19 13 14 28 23 23 27
## Mar 6 16 24 21 27 35 14
## Apr 22 25 21 20 22 13 27
## May 18 20 21 25 23 26 13
## Jun 15 35 40 27 38 38 15
## Jul 28 37 37 35 21 33 51
## Aug 27 42 39 54 56 65 46
## Sep 68 46 117 82 98 91 91
## Oct 85 130 81 135 159 55 88
## Nov 35 17 55 34 15 21 15
## Dec 20 15 22 15 20 17 28
# Total number of retweets per month and weekday (the result column keeps the
# name Tweetcount although it holds a retweet sum).
# ungroup() is applied to the summarised result, which is the object that
# actually carries grouping; newTestDate itself is never grouped.
newTestDate %>%
  group_by(month, wday) %>%
  summarise(Tweetcount = sum(Count_Retweet)) %>%
  ungroup()
# Take the long route to validate one cell of the summary: subset the data to
# month "Jan" and weekday "Sun", then sum Count_Retweet.
# Columns are referenced by bare name inside filter().
checkWeekDay <- filter(newTestDate, month == "Jan", wday == "Sun")
sum(checkWeekDay$Count_Retweet)
## [1] 719156
# Wow, our data from last group_by output matches.
Finding tweets that start with, or contain, a given pattern
# Using stringr: tweets that are retweets, i.e. whose text starts with "RT".
filter(D_trump, str_detect(Tweets, "^RT"))
# Tweets that mention the border. The match ignores case, so every spelling
# (border, Border, BORDER, ...) is treated the same way, wherever it appears.
filter(D_trump, str_detect(Tweets, regex("border", ignore_case = TRUE)))
# filter() can create any subset we need, e.g. tweets mentioning a handle.
# The pattern allows letters, digits and underscores after the "@".
filter(D_trump, str_detect(Tweets, "@[[:alnum:]_]+"))
# Extract every handle of every tweet into a character matrix (one row per
# tweet) and keep it for further processing.
# Underscores are part of the match, so a handle is captured in full; output
# produced with the earlier pattern "@[[:alnum:]]+" shows handles cut off at
# the first underscore.
At_Tweet <- str_extract_all(D_trump$Tweets, "@[[:alnum:]_]+", simplify = TRUE)
tail(At_Tweet, 15)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [3218,] "" "" "" "" "" "" ""
## [3219,] "" "" "" "" "" "" ""
## [3220,] "" "" "" "" "" "" ""
## [3221,] "" "" "" "" "" "" ""
## [3222,] "" "" "" "" "" "" ""
## [3223,] "@CNN" "" "" "" "" "" ""
## [3224,] "@Trump" "@Nigel" "@realDonaldTrump" "" "" "" ""
## [3225,] "" "" "" "" "" "" ""
## [3226,] "@Mike" "" "" "" "" "" ""
## [3227,] "@FoxNews" "" "" "" "" "" ""
## [3228,] "" "" "" "" "" "" ""
## [3229,] "" "" "" "" "" "" ""
## [3230,] "" "" "" "" "" "" ""
## [3231,] "" "" "" "" "" "" ""
## [3232,] "" "" "" "" "" "" ""
#----------------------JOIN
# Two overlapping slices of D_trump that share the TweetURL column.
table1 <- D_trump[5:20, c("TweetURL", "TweetDate")]
table2 <- D_trump[1:15, c("TweetURL", "Tweets")]
glimpse(table1)
## Observations: 16
## Variables: 2
## $ TweetURL <chr> "https://twitter.com/realDonaldTrump/status/92207065...
## $ TweetDate <chr> "10/22/17 12:02", "10/22/17 11:50", "10/22/17 0:09",...
glimpse(table2)
## Observations: 15
## Variables: 2
## $ TweetURL <chr> "https://twitter.com/realDonaldTrump/status/922440008...
## $ Tweets <chr> "I had a very respectful conversation with the widow ...
# The URL is unique for each tweet, so it serves as the join key.
# Full join: keeps the rows of both tables.
table_Full <- dplyr::full_join(table1, table2, by = "TweetURL")
glimpse(table_Full)
## Observations: 20
## Variables: 3
## $ TweetURL <chr> "https://twitter.com/realDonaldTrump/status/92207065...
## $ TweetDate <chr> "10/22/17 12:02", "10/22/17 11:50", "10/22/17 0:09",...
## $ Tweets <chr> "Wacky Congresswoman Wilson is the gift that keeps o...
# Note how both data sets got combined here: some entries show NA for Tweets
# and others show NA for TweetDate.
table_Full
# Inner join: only the rows whose TweetURL occurs in both table1 and table2.
table_Inner <- table1 %>% dplyr::inner_join(table2, by = "TweetURL")
table_Inner
# Anti join: the rows of table1 that have no match in table2.
table_Anti <- table1 %>% dplyr::anti_join(table2, by = "TweetURL")
table_Anti
# Left join: every row of the left table (table1) plus the matching data from
# the right table (table2).
table_Left <- table1 %>% dplyr::left_join(table2, by = "TweetURL")
table_Left
table1
# Right join: every row of the right table (table2) plus the matching data
# from the left table (table1).
table_Right <- table1 %>% dplyr::right_join(table2, by = "TweetURL")
table_Right
table2
# To attach the matches as a nested tibble, use nest_join().
table_Next <- dplyr::nest_join(table1, table2, by = "TweetURL")
table_Next
# The matching rows of table2 are added as a nested tibble in the list column
# named table2.
table_Next[1, ]
# The join key column is called TweetURL. The tables have no `url` column, so
# table_Next$url only produced an "Unknown or uninitialised column" warning
# and NULL.
table_Next$TweetURL[1]
table_Next$table2[[1]]
# keep = TRUE asks nest_join() to keep the key column.
# NOTE(review): presumably this retains TweetURL inside the nested tibbles -
# confirm against the nest_join() documentation.
table_NextKeep <- dplyr::nest_join(table1, table2, by = "TweetURL", keep = TRUE)
table_NextKeep[1, ]
table_NextKeep$TweetURL[1]
table_NextKeep$table2[[1]]
#---------------------------------
# The forcats package
# forcats is a core package in the tidyverse. It is installed via
# install.packages("tidyverse") and attached via library(tidyverse); it can
# also be loaded on its own via library(forcats). Its main functions start with
# fct_. There is no coherent family of base functions that forcats replaces,
# which is why it is such a welcome addition.
data("diamonds")
str(diamonds$cut)
## Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
# Count the rows per level of cut with dplyr ...
count(diamonds, cut)
# ... and the same counts with forcats.
fct_count(diamonds$cut)
-------- * CLI Package * --------
# Horizontal rule with a centred label; the line itself is coloured red.
cli::rule(
  center = " * CLI Package * ",
  line_col = "red"
)
## -------- * CLI Package * --------
# Rule drawn with the "bar2" line style.
cli::rule(line = "bar2")
## ___________________________________
# "dots2" is printed literally and repeated across the width, as the output
# below shows.
cli::rule(line = "dots2")
## dots2dots2dots2dots2dots2dots2dots2
# Rule with a centred label, using the col argument for the colour.
cli::rule(
  center = " * RESULTS * ",
  col = "red"
)
## ---------- * RESULTS * ----------
# Names of the spinners that cli provides.
cli::list_spinners()
## [1] "dots" "dots2" "dots3"
## [4] "dots4" "dots5" "dots6"
## [7] "dots7" "dots8" "dots9"
## [10] "dots10" "dots11" "dots12"
## [13] "line" "line2" "pipe"
## [16] "simpleDots" "simpleDotsScrolling" "star"
## [19] "star2" "flip" "hamburger"
## [22] "growVertical" "growHorizontal" "balloon"
## [25] "balloon2" "noise" "bounce"
## [28] "boxBounce" "boxBounce2" "triangle"
## [31] "arc" "circle" "squareCorners"
## [34] "circleQuarters" "circleHalves" "squish"
## [37] "toggle" "toggle2" "toggle3"
## [40] "toggle4" "toggle5" "toggle6"
## [43] "toggle7" "toggle8" "toggle9"
## [46] "toggle10" "toggle11" "toggle12"
## [49] "toggle13" "arrow" "arrow2"
## [52] "arrow3" "bouncingBar" "bouncingBall"
## [55] "smiley" "monkey" "hearts"
## [58] "clock" "earth" "moon"
## [61] "runner" "pong" "shark"
## [64] "dqpb"
# Draw a box around the greeting, with padding 1 and centred placement.
cli::boxx("Hello there!", padding = 1, float = "center")
## +------------------+
## | |
## | Hello there! |
## | |
## +------------------+