Assigning values to variables

Use <- to assign a value to a variable.

# Numbers (R calls this type "numeric")
dusp_course <- 11

# Text (R calls this type "character"; the value goes inside quotes)
city_name <- "Cambridge"

# Boolean (true or false); R calls this type "logical"
sanctuary_city <- TRUE

Displaying the contents of a variable

Putting a variable on a line by itself will display the contents of that variable no matter what type it is.

# Each bare variable name below prints that variable's contents.
# The lines starting with ## show what R prints in response.
dusp_course
## [1] 11
city_name
## [1] "Cambridge"
sanctuary_city
## [1] TRUE

Applying a function to the contents of a variable

# A traditional mathematical function, square root

# First, just perform the function on a number and display the results
sqrt(27)
## [1] 5.196152
# Storing a number into a variable. Nothing gets displayed, but it is stored for future use.
x <- 27

# We can now display what is in that variable
x
## [1] 27
# Using that stored variable, perform the function on it
sqrt(x)
## [1] 5.196152
# Now repeat, but store the result (again, nothing is displayed)
y <- sqrt(x)

# And display the result
y
## [1] 5.196152

Performing some math on the contents of a variable

# We still have x from before
x
## [1] 27
# Multiply by 3, and store in variable y. The previous value of y
# (the square root stored earlier) is overwritten.
y <- x * 3
y
## [1] 81

Apply a function that has a “parameter”

# Put the results of a calculation into the variable x
x <- 308 * 0.415
x
## [1] 127.82
# Round to the nearest whole number (0 digits from the decimal point).
# The "digits" parameter tells round() how many decimal places to keep.
d <- 0
y <- round(x, digits=d)
y
## [1] 128

Converting a variable that is a number to text

# We have this from before, but it is stored as a number "type"
dusp_course
## [1] 11
# Change it into text "type", and store it into a new variable.
# The quotes in the output below show that the value is now text.
dusp_course2 <- as.character(dusp_course)
dusp_course2
## [1] "11"

Adding strings together using the paste0 function

Note: The `paste` function automatically adds spaces while `paste0` does not. I prefer to always use `paste0` and manually put in the spaces if needed.

# We have this from before
dusp_course
## [1] 11
dusp_text <- "At MIT, DUSP is course:"
# paste0 joins its arguments with nothing in between, so there is no
# space before the 11. The number is turned into text automatically.
fulltext <- paste0(dusp_text,dusp_course)
fulltext
## [1] "At MIT, DUSP is course:11"
# Now with a space, supplied as its own piece of text in the middle
fulltext <- paste0(dusp_text,' ',dusp_course)
fulltext
## [1] "At MIT, DUSP is course: 11"

Combining a list of objects together (called a “vector”) using the c function

# Making a list of numbers
x <- c(5,3,8,7,3)
x
## [1] 5 3 8 7 3
# Making a list of states
states <- c("MA","UT","MD","OH","ND")
states
## [1] "MA" "UT" "MD" "OH" "ND"
# You can ask for a specific item in your list, here the third item
# (positions are counted starting from 1)
states[3]
## [1] "MD"

Apply a manipulation to every item in a vector

# We have x from before
x
## [1] 5 3 8 7 3
# Multiply each item by 10 (the operation is applied to every item at once)
store <- x*10
store
## [1] 50 30 80 70 30

Creating a vector of numbers using a colon or the seq function

# Type every number from 2 to 10 by hand
t2 <- c(2,3,4,5,6,7,8,9,10)
# The colon says make a list of numbers between 2 and 10
t <- 2:10
t2
## [1]  2  3  4  5  6  7  8  9 10
t
## [1]  2  3  4  5  6  7  8  9 10
# Do it manually
w <- c(2,4,6,8)
w
## [1] 2 4 6 8
# Now using the sequence function: from 2 up to 1000 in steps of 2
w2 <- seq(2,1000,by=2)
w2
##   [1]    2    4    6    8   10   12   14   16   18   20   22   24   26   28   30
##  [16]   32   34   36   38   40   42   44   46   48   50   52   54   56   58   60
##  [31]   62   64   66   68   70   72   74   76   78   80   82   84   86   88   90
##  [46]   92   94   96   98  100  102  104  106  108  110  112  114  116  118  120
##  [61]  122  124  126  128  130  132  134  136  138  140  142  144  146  148  150
##  [76]  152  154  156  158  160  162  164  166  168  170  172  174  176  178  180
##  [91]  182  184  186  188  190  192  194  196  198  200  202  204  206  208  210
## [106]  212  214  216  218  220  222  224  226  228  230  232  234  236  238  240
## [121]  242  244  246  248  250  252  254  256  258  260  262  264  266  268  270
## [136]  272  274  276  278  280  282  284  286  288  290  292  294  296  298  300
## [151]  302  304  306  308  310  312  314  316  318  320  322  324  326  328  330
## [166]  332  334  336  338  340  342  344  346  348  350  352  354  356  358  360
## [181]  362  364  366  368  370  372  374  376  378  380  382  384  386  388  390
## [196]  392  394  396  398  400  402  404  406  408  410  412  414  416  418  420
## [211]  422  424  426  428  430  432  434  436  438  440  442  444  446  448  450
## [226]  452  454  456  458  460  462  464  466  468  470  472  474  476  478  480
## [241]  482  484  486  488  490  492  494  496  498  500  502  504  506  508  510
## [256]  512  514  516  518  520  522  524  526  528  530  532  534  536  538  540
## [271]  542  544  546  548  550  552  554  556  558  560  562  564  566  568  570
## [286]  572  574  576  578  580  582  584  586  588  590  592  594  596  598  600
## [301]  602  604  606  608  610  612  614  616  618  620  622  624  626  628  630
## [316]  632  634  636  638  640  642  644  646  648  650  652  654  656  658  660
## [331]  662  664  666  668  670  672  674  676  678  680  682  684  686  688  690
## [346]  692  694  696  698  700  702  704  706  708  710  712  714  716  718  720
## [361]  722  724  726  728  730  732  734  736  738  740  742  744  746  748  750
## [376]  752  754  756  758  760  762  764  766  768  770  772  774  776  778  780
## [391]  782  784  786  788  790  792  794  796  798  800  802  804  806  808  810
## [406]  812  814  816  818  820  822  824  826  828  830  832  834  836  838  840
## [421]  842  844  846  848  850  852  854  856  858  860  862  864  866  868  870
## [436]  872  874  876  878  880  882  884  886  888  890  892  894  896  898  900
## [451]  902  904  906  908  910  912  914  916  918  920  922  924  926  928  930
## [466]  932  934  936  938  940  942  944  946  948  950  952  954  956  958  960
## [481]  962  964  966  968  970  972  974  976  978  980  982  984  986  988  990
## [496]  992  994  996  998 1000

Apply a function to a vector that contains a list of numbers

# Count the number of items in a vector (x still holds the five numbers from before)
l <- length(x)
l
## [1] 5
# Calculate the average (mean) of the values
m <- mean(x)
m
## [1] 5.2

Putting a few steps together to calculate the percent of the total

# We have x from before
x
## [1] 5 3 8 7 3
# Calculate the total (this overwrites the t that was created earlier)
t <- sum(x)
t
## [1] 26
# Calculate each value's share of the total. The result is a fraction
# between 0 and 1; multiply it by 100 to express it as a percent.
p <- x/t
p
## [1] 0.1923077 0.1153846 0.3076923 0.2692308 0.1153846

Label the items in a vector

# We have the variables x and states from before. There needs
# to be the same number of items in both vectors.
x
## [1] 5 3 8 7 3
# Assign the states as names to the items in the vector. In essence these
# are column headers; they don't affect the numbers themselves.
names(x) <- states
x
## MA UT MD OH ND 
##  5  3  8  7  3

Make a barplot of the values in a vector

# Now that the values have names associated with them, they
# will show up on the x-axis as labels.

barplot(x)

# Now with axis labels and a title

barplot(x, xlab="State", ylab="# of students from this state", main="Where are they from?")

# EXERCISE: REPEAT BUT MAKE HORIZONTAL AND CHANGE COLOR
# (hint: look at the horiz and col arguments in ?barplot)

# Google "r barplot" for help

Introduction to the “dataframe” data type

# rivers is a built-in dataset which is a vector of numbers:
# the lengths (in miles) of 141 "major" rivers in North America,
# as compiled by the US Geological Survey.
rivers
##   [1]  735  320  325  392  524  450 1459  135  465  600  330  336  280  315  870
##  [16]  906  202  329  290 1000  600  505 1450  840 1243  890  350  407  286  280
##  [31]  525  720  390  250  327  230  265  850  210  630  260  230  360  730  600
##  [46]  306  390  420  291  710  340  217  281  352  259  250  470  680  570  350
##  [61]  300  560  900  625  332 2348 1171 3710 2315 2533  780  280  410  460  260
##  [76]  255  431  350  760  618  338  981 1306  500  696  605  250  411 1054  735
##  [91]  233  435  490  310  460  383  375 1270  545  445 1885  380  300  380  377
## [106]  425  276  210  800  420  350  360  538 1100 1205  314  237  610  360  540
## [121] 1038  424  310  300  444  301  268  620  215  652  900  525  246  360  529
## [136]  500  720  270  430  671 1770
# You should think of a "dataframe" as an Excel spreadsheet
# with columns and rows. This one starts with a single column, named "rivers".
mydata <- data.frame(rivers)

Number of rows in a dataframe

# One row per river, so this matches the 141 river lengths
nrow(mydata)
## [1] 141

Display only part of the data in a dataframe

# Display the first six rows (six is what head() shows when no count is given)
head(mydata)
##   rivers
## 1    735
## 2    320
## 3    325
## 4    392
## 5    524
## 6    450
# Display the first ten rows
head(mydata,10)
##    rivers
## 1     735
## 2     320
## 3     325
## 4     392
## 5     524
## 6     450
## 7    1459
## 8     135
## 9     465
## 10    600
# Display the last six rows (the row numbers on the left run up to 141)
tail(mydata)
##     rivers
## 136    500
## 137    720
## 138    270
## 139    430
## 140    671
## 141   1770

Basic stats about a dataframe

# For each column: minimum, quartiles, median, mean and maximum
summary(mydata)
##      rivers      
##  Min.   : 135.0  
##  1st Qu.: 310.0  
##  Median : 425.0  
##  Mean   : 591.2  
##  3rd Qu.: 680.0  
##  Max.   :3710.0

Create a histogram of the data

# Send one column to the histogram function, let the function
# determine the number of buckets itself

hist(mydata$rivers)

# Suggest the number of buckets yourself. R treats a single number given
# to breaks as a suggestion only, and may use a nearby count so that the
# bucket edges land on round values.

hist(mydata$rivers, breaks=4)

Create a new column in the dataframe. Refer to columns using the $ sign

# Create a new column with kilometers instead of miles
# (1 mile is about 1.609 kilometers). The calculation is applied to every row.
mydata$riverskm <- mydata$rivers * 1.609
head(mydata)
##   rivers riverskm
## 1    735 1182.615
## 2    320  514.880
## 3    325  522.925
## 4    392  630.728
## 5    524  843.116
## 6    450  724.050

Make up some random numbers

# The water quality for each river is evaluated on a scale of
# 1 to 100. Let's assign a random value for each river. We
# need 141 random numbers, one for each row of mydata.
# replace = TRUE lets the same score be drawn more than once; that is
# required here because we draw more numbers (141) than there are
# possible scores (100).
# Always spell out TRUE: the shortcut T is an ordinary variable that can
# be reassigned, which would silently change what this line does.
# The numbers will differ on every run unless set.seed() is called first.
s <- sample(1:100, 141, replace = TRUE)
s
##   [1]  30  43  99  26  96  29  73  66  68  60  32  83  33  32  72  33  61  35
##  [19]  13  90  54  56  66  16  44  48  36  87  36   3  35  18  20  77  87  84
##  [37]   3  47  77  79  53  60  92  37  38  38  94  33  50   3  26  11  36   3
##  [55]  59  96  64  85  76  15  77   2  63  61   5  58  54  97  58  23  50  88
##  [73]  16  32  85  41  47  89  58  66  20  54  94  54  38  70  65  28  40  92
##  [91]  17  47  39  91  99  70  12  36  30  86  32  86  11  45  68  26  77  93
## [109]  85  37  67  93  12  20  55  64  27  66  42  70  24  44  81  97  66  53
## [127]  81   8  68 100  37  86  15  78  67  22   1  84  98  40  36
# Make a new column in the dataframe with water quality
# (s holds one value per row, so each river gets one score)
mydata$quality <- s
head(mydata)
##   rivers riverskm quality
## 1    735 1182.615      30
## 2    320  514.880      43
## 3    325  522.925      99
## 4    392  630.728      26
## 5    524  843.116      96
## 6    450  724.050      29

Plotting x,y values to investigate correlation

We want to investigate whether there is a correlation between the length of a river and its water quality. To do that, we want to make a scatter plot of the river lengths and the water quality.

# Scatter plot: river length on the x-axis, water quality on the y-axis
plot(mydata$rivers, mydata$quality)

# Regression line (NOTE: it's y ~ x)
# Name the columns in the formula and hand the dataframe to lm() through
# the data argument, rather than writing mydata$... inside the formula.
# The fitted line is the same, but the model then knows its variables by
# column name, so summary(myline) reads cleanly and
# predict(myline, newdata = ...) works on new data.
myline <- lm(quality ~ rivers, data = mydata)

# Add the line to the plot
abline(myline, col = "red")