# Build two parallel vectors: one test score per student, in matching order.
test_scores <- c(92, 75, 84, 94, 88, 89, 91)
students <- c(
  "Jerry", "Monica", "Felix", "James", "April", "Ruth", "Tony"
)
# Combine the vectors column-wise into a data frame and display it.
grades <- data.frame(test_scores = test_scores, students = students)
grades
## test_scores students
## 1 92 Jerry
## 2 75 Monica
## 3 84 Felix
## 4 94 James
## 5 88 April
## 6 89 Ruth
## 7 91 Tony
# Confirm the type of the object built with data.frame().
class(grades)
## [1] "data.frame"
# Pitfall: cbind() on plain vectors builds a matrix, not a data frame.
# A matrix holds a single type, so the scores are stored as text here.
gradesmatrix <- cbind(test_scores = test_scores, students = students)
class(gradesmatrix)
## [1] "matrix" "array"
#create a tibble
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.1.1 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Rebuild the table as a tibble; columns are named after the source vectors.
grades <- tibble(test_scores = test_scores, students = students)
grades
## # A tibble: 7 x 2
## test_scores students
## <dbl> <chr>
## 1 92 Jerry
## 2 75 Monica
## 3 84 Felix
## 4 94 James
## 5 88 April
## 6 89 Ruth
## 7 91 Tony
# Replace the second test score with a missing value, then show the table.
grades$test_scores[2] <- NA_real_
grades
## # A tibble: 7 x 2
## test_scores students
## <dbl> <chr>
## 1 92 Jerry
## 2 NA Monica
## 3 84 Felix
## 4 94 James
## 5 88 April
## 6 89 Ruth
## 7 91 Tony
# Query: names of students scoring 90 or above. Comparing the missing score
# with 90 yields NA, so an NA shows up in the result.
grades[["students"]][grades[["test_scores"]] >= 90]
## [1] "Jerry" NA "James" "Tony"
# Same condition used as a row filter; an empty column index returns all
# columns, and the missing score again produces an all-NA row.
grades[grades[["test_scores"]] >= 90, ]
## # A tibble: 4 x 2
## test_scores students
## <dbl> <chr>
## 1 92 Jerry
## 2 NA <NA>
## 3 94 James
## 4 91 Tony
# Flag the rows that contain no missing values (one logical per row).
complete.cases(grades)
## [1] TRUE FALSE TRUE TRUE TRUE TRUE TRUE
# Use the flags as a row filter; the result is displayed but not stored.
grades[complete.cases(grades), ]
## # A tibble: 6 x 2
## test_scores students
## <dbl> <chr>
## 1 92 Jerry
## 2 84 Felix
## 3 94 James
## 4 88 April
## 5 89 Ruth
## 6 91 Tony
# Store the NA-free table back into `grades`, then repeat the >= 90 query;
# with the incomplete row gone, no NA row appears in the result.
grades <- grades[complete.cases(grades), ]
grades[grades[["test_scores"]] >= 90, ]
## # A tibble: 3 x 2
## test_scores students
## <dbl> <chr>
## 1 92 Jerry
## 2 94 James
## 3 91 Tony
# Sort by score, highest first, breaking ties alphabetically by student.
# The sort keys must come from `grades` itself: the standalone vectors
# `test_scores` and `students` still hold all seven original values, so
# ordering by them would produce row positions that do not match the
# six-row, NA-free `grades` table.
grades[order(-grades$test_scores, grades$students), 1:2] # omitting 1:2 will default to showing all columns
## # A tibble: 6 x 2
## test_scores students
## <dbl> <chr>
## 1 94 James
## 2 92 Jerry
## 3 91 Tony
## 4 89 Ruth
## 5 88 April
## 6 84 Felix
# Logical vector marking the complete rows of `grades`.
cleangrades <- complete.cases(grades)
# Sort in ascending order and specify the columns to show (column 1 only).
grades[order(grades$test_scores), 1]
## # A tibble: 6 x 1
## test_scores
## <dbl>
## 1 84
## 2 88
## 3 89
## 4 91
## 5 92
## 6 94
# Or do the same descending sort with dplyr; desc() orders a column from
# highest to lowest.
library(dplyr)
arrange(grades, desc(test_scores))
## # A tibble: 6 x 2
## test_scores students
## <dbl> <chr>
## 1 94 James
## 2 92 Jerry
## 3 91 Tony
## 4 89 Ruth
## 5 88 April
## 6 84 Felix
# Keep only the rows with a score of at least 85.
grades %>% filter(test_scores >= 85)
## # A tibble: 5 x 2
## test_scores students
## <dbl> <chr>
## 1 92 Jerry
## 2 94 James
## 3 88 April
## 4 89 Ruth
## 5 91 Tony
# Keep only the test_scores column.
select(grades, test_scores)
## # A tibble: 6 x 1
## test_scores
## <dbl>
## 1 92
## 2 84
## 3 94
## 4 88
## 5 89
## 6 91
# Chain both steps: pick the column first, then filter its rows.
grades %>%
  select(test_scores) %>%
  filter(test_scores >= 85)
## # A tibble: 5 x 1
## test_scores
## <dbl>
## 1 92
## 2 94
## 3 88
## 4 89
## 5 91
library(readr)
# A large `scipen` penalty makes R prefer fixed notation over scientific
# notation when printing numbers.
options(scipen = 999)
# Load the GDP table; the path is relative to the working directory.
gdp <- read_csv(file = "gdp.csv")
Use square brackets to subset the data. Inside the brackets, the row index goes before the comma and the column index after it.
gdp[1,] # shows the first row with all variables (empty column index = every column)
## # A tibble: 1 x 60
## `Country Name` `Country Code` `1960` `1961` `1962` `1963` `1964` `1965` `1966`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Aruba ABW NA NA NA NA NA NA NA
## # … with 51 more variables: 1967 <dbl>, 1968 <dbl>, 1969 <dbl>, 1970 <dbl>,
## # 1971 <dbl>, 1972 <dbl>, 1973 <dbl>, 1974 <dbl>, 1975 <dbl>, 1976 <dbl>,
## # 1977 <dbl>, 1978 <dbl>, 1979 <dbl>, 1980 <dbl>, 1981 <dbl>, 1982 <dbl>,
## # 1983 <dbl>, 1984 <dbl>, 1985 <dbl>, 1986 <dbl>, 1987 <dbl>, 1988 <dbl>,
## # 1989 <dbl>, 1990 <dbl>, 1991 <dbl>, 1992 <dbl>, 1993 <dbl>, 1994 <dbl>,
## # 1995 <dbl>, 1996 <dbl>, 1997 <dbl>, 1998 <dbl>, 1999 <dbl>, 2000 <dbl>,
## # 2001 <dbl>, 2002 <dbl>, 2003 <dbl>, 2004 <dbl>, 2005 <dbl>, 2006 <dbl>,
## # 2007 <dbl>, 2008 <dbl>, 2009 <dbl>, 2010 <dbl>, 2011 <dbl>, 2012 <dbl>,
## # 2013 <dbl>, 2014 <dbl>, 2015 <dbl>, 2016 <dbl>, 2017 <dbl>
gdp[,1] # shows the Country Name variable for all rows (empty row index = every row)
## # A tibble: 264 x 1
## `Country Name`
## <chr>
## 1 Aruba
## 2 Afghanistan
## 3 Angola
## 4 Albania
## 5 Andorra
## 6 Arab World
## 7 United Arab Emirates
## 8 Argentina
## 9 Armenia
## 10 American Samoa
## # … with 254 more rows
# Select a block of rows and columns in one step.
gdp[1:6, c(1, 2, 57:60)] # rows 1:6, columns 1, 2, and 57 through 60 inclusive
## # A tibble: 6 x 6
## `Country Name` `Country Code` `2014` `2015` `2016` `2017`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Aruba ABW NA NA NA NA
## 2 Afghanistan AFG 2.06e10 1.92e10 1.95e10 2.08e10
## 3 Angola AGO 1.27e11 1.03e11 9.53e10 1.24e11
## 4 Albania ALB 1.32e10 1.14e10 1.19e10 1.30e10
## 5 Andorra AND 3.35e 9 2.81e 9 2.88e 9 3.01e 9
## 6 Arab World ARB 2.91e12 2.55e12 2.50e12 2.59e12
# Add a new column with `$`; the single NA is recycled to every row.
gdp$silly_column <- NA
# Print the column that was just created to confirm it exists.
print(gdp$silly_column)
## [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [26] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [51] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [76] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [101] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [126] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [151] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [176] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [201] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [226] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [251] NA NA NA NA NA NA NA NA NA NA NA NA NA NA
Check the dimensions of the data to confirm that a new variable (column) was added.
dim(gdp)
## [1] 264 61
gdp["anothercolumn"] <- NA # bracket assignment by name is another way to add a column
print(gdp$anothercolumn)
## [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [26] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [51] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [76] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [101] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [126] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [151] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [176] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [201] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [226] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [251] NA NA NA NA NA NA NA NA NA NA NA NA NA NA
# Re-check the dimensions after adding `anothercolumn`.
dim(gdp)
## [1] 264 62
For more on reordering data frame columns, see: http://www.sthda.com/english/wiki/reordering-data-frame-columns-in-r
# The index vector sets the column order of the result. c(1, 2:62) lists
# columns 1 through 62 in their existing order, so gdp2 keeps gdp's layout;
# change the order of the indices to move columns.
gdp2 <- gdp[, c(1, 2:62)]
gdp2
## # A tibble: 264 x 62
## `Country Name` `Country Code` `1960` `1961` `1962` `1963` `1964`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Aruba ABW NA NA NA NA NA
## 2 Afghanistan AFG 5.38e8 5.49e8 5.47e 8 7.51e 8 8.00e 8
## 3 Angola AGO NA NA NA NA NA
## 4 Albania ALB NA NA NA NA NA
## 5 Andorra AND NA NA NA NA NA
## 6 Arab World ARB NA NA NA NA NA
## 7 United Arab Emi… ARE NA NA NA NA NA
## 8 Argentina ARG NA NA 2.45e10 1.83e10 2.56e10
## 9 Armenia ARM NA NA NA NA NA
## 10 American Samoa ASM NA NA NA NA NA
## # … with 254 more rows, and 55 more variables: 1965 <dbl>, 1966 <dbl>,
## # 1967 <dbl>, 1968 <dbl>, 1969 <dbl>, 1970 <dbl>, 1971 <dbl>, 1972 <dbl>,
## # 1973 <dbl>, 1974 <dbl>, 1975 <dbl>, 1976 <dbl>, 1977 <dbl>, 1978 <dbl>,
## # 1979 <dbl>, 1980 <dbl>, 1981 <dbl>, 1982 <dbl>, 1983 <dbl>, 1984 <dbl>,
## # 1985 <dbl>, 1986 <dbl>, 1987 <dbl>, 1988 <dbl>, 1989 <dbl>, 1990 <dbl>,
## # 1991 <dbl>, 1992 <dbl>, 1993 <dbl>, 1994 <dbl>, 1995 <dbl>, 1996 <dbl>,
## # 1997 <dbl>, 1998 <dbl>, 1999 <dbl>, 2000 <dbl>, 2001 <dbl>, 2002 <dbl>,
## # 2003 <dbl>, 2004 <dbl>, 2005 <dbl>, 2006 <dbl>, 2007 <dbl>, 2008 <dbl>,
## # 2009 <dbl>, 2010 <dbl>, 2011 <dbl>, 2012 <dbl>, 2013 <dbl>, 2014 <dbl>,
## # 2015 <dbl>, 2016 <dbl>, 2017 <dbl>, silly_column <lgl>, anothercolumn <lgl>
library(dplyr)
# Drop a column by name; the leading minus means "everything except".
gdp2 <- gdp2 %>% select(-anothercolumn)
dim(gdp2)
## [1] 264 61
# Show each column's name, type, and first few values.
str(gdp2)
## tibble[,61] [264 × 61] (S3: tbl_df/tbl/data.frame)
## $ Country Name: chr [1:264] "Aruba" "Afghanistan" "Angola" "Albania" ...
## $ Country Code: chr [1:264] "ABW" "AFG" "AGO" "ALB" ...
## $ 1960 : num [1:264] NA 537777811 NA NA NA ...
## $ 1961 : num [1:264] NA 548888896 NA NA NA ...
## $ 1962 : num [1:264] NA 546666678 NA NA NA ...
## $ 1963 : num [1:264] NA 751111191 NA NA NA ...
## $ 1964 : num [1:264] NA 800000044 NA NA NA ...
## $ 1965 : num [1:264] NA 1006666638 NA NA NA ...
## $ 1966 : num [1:264] NA 1399999967 NA NA NA ...
## $ 1967 : num [1:264] NA 1673333418 NA NA NA ...
## $ 1968 : num [1:264] NA 1373333367 NA NA NA ...
## $ 1969 : num [1:264] NA 1408888922 NA NA NA ...
## $ 1970 : num [1:264] NA 1748886596 NA NA 78619206 ...
## $ 1971 : num [1:264] NA 1831108971 NA NA 89409820 ...
## $ 1972 : num [1:264] NA 1595555476 NA NA 113408232 ...
## $ 1973 : num [1:264] NA 1733333264 NA NA 150820103 ...
## $ 1974 : num [1:264] NA 2155555498 NA NA 186558696 ...
## $ 1975 : num [1:264] NA 2366666616 NA NA 220127246 ...
## $ 1976 : num [1:264] NA 2555555567 NA NA 227281025 ...
## $ 1977 : num [1:264] NA 2953333418 NA NA 254020153 ...
## $ 1978 : num [1:264] NA 3300000109 NA NA 308008898 ...
## $ 1979 : num [1:264] NA 3697940410 NA NA 411578334 ...
## $ 1980 : num [1:264] NA 3641723322 5930503401 NA 446416106 ...
## $ 1981 : num [1:264] NA 3478787909 5550483036 NA 388958731 ...
## $ 1982 : num [1:264] NA NA 5550483036 NA 375895956 ...
## $ 1983 : num [1:264] NA NA 5784341596 NA 327861833 ...
## $ 1984 : num [1:264] NA NA 6131475065 1924242453 330070689 ...
## $ 1985 : num [1:264] NA NA 7553560459 1965384586 346737965 ...
## $ 1986 : num [1:264] NA NA 7072063345 2173750012 482000594 ...
## $ 1987 : num [1:264] NA NA 8083872012 2156624900 611316399 ...
## $ 1988 : num [1:264] NA NA 8769250550 2126000000 721425939 ...
## $ 1989 : num [1:264] NA NA 10201099040 2335124988 795449332 ...
## $ 1990 : num [1:264] NA NA 11228764963 2101624962 1029048482 ...
## $ 1991 : num [1:264] NA NA 10603784541 1139166646 1106928583 ...
## $ 1992 : num [1:264] NA NA 8307810974 709452584 1210013652 ...
## $ 1993 : num [1:264] NA NA 5768720422 1228071038 1007025755 ...
## $ 1994 : num [1:264] 1330167598 NA 4438321017 1985673798 1017549124 ...
## $ 1995 : num [1:264] 1320670391 NA 5538749260 2424499009 1178738991 ...
## $ 1996 : num [1:264] 1379888268 NA 7526446606 3314898292 1223945357 ...
## $ 1997 : num [1:264] 1531843575 NA 7648377413 2359903108 1180597273 ...
## $ 1998 : num [1:264] 1665363128 NA 6506229607 2707123772 1211932398 ...
## $ 1999 : num [1:264] 1722798883 NA 6152922943 3414760915 1239876305 ...
## $ 2000 : num [1:264] 1873452514 NA 9129594819 3632043908 1434429703 ...
## $ 2001 : num [1:264] 1920262570 2461665938 8936063723 4060758804 1496912752 ...
## $ 2002 : num [1:264] 1941094972 4128820723 12497347956 4435078648 1733116883 ...
## $ 2003 : num [1:264] 2021301676 4583644246 14188949398 5746945913 2398645598 ...
## $ 2004 : num [1:264] 2228279330 5285465686 19640853734 7314865176 2935659300 ...
## $ 2005 : num [1:264] 2331005587 6275073572 28233712738 8158548717 3255789081 ...
## $ 2006 : num [1:264] 2421474860 7057598407 41789479932 8992642349 3543256806 ...
## $ 2007 : num [1:264] 2623726257 9843842455 60448924662 10701011897 4016972351 ...
## $ 2008 : num [1:264] 2791960894 10190529882 84178035579 12881352688 4007353157 ...
## $ 2009 : num [1:264] 2498932961 12486943506 75492385928 12044212904 3660530703 ...
## $ 2010 : num [1:264] 2467703911 15936800636 82526143645 11926953259 3355695364 ...
## $ 2011 : num [1:264] 2584463687 17930239400 104115807986 12890867539 3442062830 ...
## $ 2012 : num [1:264] NA 20536542737 113923162050 12319784787 3164615187 ...
## $ 2013 : num [1:264] NA 20264253974 124912503781 12776277515 3281585236 ...
## $ 2014 : num [1:264] NA 20616104298 126730196125 13228244357 3350736367 ...
## $ 2015 : num [1:264] NA 19215562179 102621215573 11386931490 2811489409 ...
## $ 2016 : num [1:264] NA 19469022208 95337203468 11883682171 2877311947 ...
## $ 2017 : num [1:264] NA 20815300220 124209385825 13039352744 3012914131 ...
## $ silly_column: logi [1:264] NA NA NA NA NA NA ...
# Per-column summary of gdp2.
summary(gdp2)
This is the basic logarithm function, with 9 as the value and 3 as the base. The result is 2 because 9 is the square of 3 (3^2 = 9).
# log in R - core syntax: log(value, base)
log(9, base = 3)
## [1] 2
Here the second parameter has been omitted, so the base defaults to e and the call returns the natural logarithm of 5.
# Natural logarithm: with no base given, log() uses base e.
log(5)
## [1] 1.609438
# 2.718 is only a rounded value of e, so this result differs slightly.
log(5, base = 2.718)
## [1] 1.609605
# exp() undoes the natural logarithm.
exp(1.609438)
## [1] 5
A log transformation applies a logarithm to data in order to reduce its skew. It is usually used when the values are highly skewed, so that the data becomes easier to read and compare. In R, a log transformation is done by applying the log() function (or a variant such as log10()) to a vector, data frame, or other data set. Before the logarithm is applied, 1 is added to each data value so that a value of 0 does not produce an undefined result (the logarithm of 0 is negative infinity). The transformed data is less skewed than the original, which makes it easier to understand.
# Sample values spanning several orders of magnitude.
myvector <- c(100, 10, 5, 2, 1, 0.5, 0.1, 0.05, 0.01, 0.001, 0.0001)
# Shift every value by 1, then take the base-10 logarithm.
transformedvector <- log10(myvector + 1)
# Plot the raw and the transformed values for comparison.
plot(myvector)
plot(transformedvector)
Log transforming a data frame in R is a little trickier, because you usually do not want to transform every column. Taking the log of the entire data set returns the log of each data point, but in most cases you only need the log of a single column, so select that column first and store the result in a new column.
# Store the natural log of the weight column in a new column.
ChickWeight$logweight <- log(ChickWeight$weight)
#head(ChickWeight)
# Compare the first few observations on the log scale and the raw scale.
plot(head(ChickWeight$Time), head(ChickWeight$logweight))
plot(head(ChickWeight$Time), head(ChickWeight$weight))
# Box plot of the 2017 values for the first ten rows only.
boxplot(gdp$`2017`[1:10])
#boxplot(gdp$`2017`)
# Replace columns 3 to 60 with log(value + 1), computed element-wise.
gdp[, 3:60] <- log(gdp[, 3:60] + 1)
# (alternatively, log1p(x) computes log(1 + x) accurately)
Learn more about using the natural logarithm with economic data: https://econbrowser.com/archives/2014/02/use-of-logarithms-in-economics
# Reload the file to discard the earlier in-place changes to `gdp`.
gdp <- read_csv(file = "gdp.csv")
# Base R: keep the rows whose country code is ARE.
uae <- subset(gdp, `Country Code` == "ARE")
library(dplyr)
# dplyr equivalent of the subset() call above.
uae2 <- gdp %>% filter(`Country Code` == "ARE")
uae2
## # A tibble: 1 x 60
## `Country Name` `Country Code` `1960` `1961` `1962` `1963` `1964` `1965` `1966`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 United Arab E… ARE NA NA NA NA NA NA NA
## # … with 51 more variables: 1967 <dbl>, 1968 <dbl>, 1969 <dbl>, 1970 <dbl>,
## # 1971 <dbl>, 1972 <dbl>, 1973 <dbl>, 1974 <dbl>, 1975 <dbl>, 1976 <dbl>,
## # 1977 <dbl>, 1978 <dbl>, 1979 <dbl>, 1980 <dbl>, 1981 <dbl>, 1982 <dbl>,
## # 1983 <dbl>, 1984 <dbl>, 1985 <dbl>, 1986 <dbl>, 1987 <dbl>, 1988 <dbl>,
## # 1989 <dbl>, 1990 <dbl>, 1991 <dbl>, 1992 <dbl>, 1993 <dbl>, 1994 <dbl>,
## # 1995 <dbl>, 1996 <dbl>, 1997 <dbl>, 1998 <dbl>, 1999 <dbl>, 2000 <dbl>,
## # 2001 <dbl>, 2002 <dbl>, 2003 <dbl>, 2004 <dbl>, 2005 <dbl>, 2006 <dbl>,
## # 2007 <dbl>, 2008 <dbl>, 2009 <dbl>, 2010 <dbl>, 2011 <dbl>, 2012 <dbl>,
## # 2013 <dbl>, 2014 <dbl>, 2015 <dbl>, 2016 <dbl>, 2017 <dbl>
# Keep several countries at once by matching the code against a set.
gdp_multiple <- subset(gdp, `Country Code` %in% c("ARE", "CHN", "GBR"))
# dplyr version of the same three-country selection.
gdp_multiple2 <- filter(gdp, `Country Code` %in% c("ARE", "CHN", "GBR"))
# Reshape columns 3 to 60 from wide to long format: each of those columns
# becomes a (year, gdp) pair of values, giving one row per country and year.
# pivot_longer() is the current replacement for the superseded gather().
library(dplyr)
library(tidyverse)
gdp_multiple2_melt <-
  gdp_multiple2 %>%
  pivot_longer(cols = 3:60, names_to = "year", values_to = "gdp")
# The year labels come from column names (text); convert them to numbers.
gdp_multiple2_melt$year <- as.numeric(gdp_multiple2_melt$year)
library(ggthemes)
library(ggplot2)
library (lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Prefer fixed notation over scientific notation when printing numbers.
options(scipen = 999)
# One line per country showing GDP by year. The year columns in the data
# run from 1960 through 2017, and the title states that range.
ggplot(
  gdp_multiple2_melt,
  aes(x = year, y = gdp, colour = `Country Name`, group = `Country Name`)
) +
  geom_line() +
  labs(
    title = "GDP from 1960 through 2017",
    caption = "Source: World Bank (2021)",
    x = "Year",
    y = "GDP"
  ) +
  theme_economist()
## Warning: Removed 15 row(s) containing missing values (geom_path).
library(ggthemes)
library(ggplot2)
library (lubridate)
# Prefer fixed notation over scientific notation when printing numbers.
options(scipen = 999)
# Same chart on a log scale: log1p(gdp) is log(1 + gdp). The year columns
# in the data run from 1960 through 2017, and the y-axis label names the
# transformed quantity that is actually plotted.
ggplot(
  gdp_multiple2_melt,
  aes(
    x = year,
    y = log1p(gdp),
    colour = `Country Name`,
    group = `Country Name`
  )
) +
  geom_line() +
  labs(
    title = "GDP from 1960 through 2017",
    caption = "Source: World Bank (2021)",
    x = "Year",
    y = "log(1 + GDP)"
  ) +
  theme_economist()
## Warning: Removed 15 row(s) containing missing values (geom_path).