Assigning values to variables

Use <- to assign a value to a variable.

# Numbers: store the numeric value 11 in the variable dusp_course
dusp_course <- 11

# Text: character strings are written inside quotes
city_name <- "Cambridge"

# Boolean (logical): either TRUE or FALSE, written in capitals without quotes
sanctuary_city <- TRUE

Displaying the contents of a variable

# Putting a variable on a line by itself will display the contents of that
# variable, no matter what type it is.
# Lines starting with ## show the output printed for the line above them.
dusp_course
## [1] 11
city_name
## [1] "Cambridge"
sanctuary_city
## [1] TRUE

Applying a function to the contents of a variable

# A traditional mathematical function: square root.
# The result of sqrt(x) is stored in y; x itself is left unchanged.
x <- 27
y <- sqrt(x)
y
## [1] 5.196152

Performing some math on the contents of a variable

# We still have x from before
x
## [1] 27
# Multiply by 3 and store the result in y (this replaces the previous value of y)
y  <- x * 3
y
## [1] 81

Apply a function that has a “parameter”

# Put the result of a calculation into the variable x
x <- 308 * 0.415
x
## [1] 127.82
# Round to the nearest whole number (0 digits after the decimal point).
# digits is a named parameter of round(). The rounded value is assigned to y
# with <- ; writing "y < round(...)" would instead compare y to the result
# and print TRUE or FALSE without changing y.
y <- round(x, digits = 0)
y
## [1] 128

Converting a variable that is a number to text

# We have this from before, stored as a number
dusp_course
## [1] 11
# Change it into text with as.character(), and store it in a new variable.
# The quotes in the output show that the value is now text.
dusp_course2 <- as.character(dusp_course)
dusp_course2
## [1] "11"

Adding strings together using the paste0 function

# paste0() joins its arguments end to end with no separator between them
dusp_text <- "At MIT, DUSP is course: "
fulltext <- paste0(dusp_text, dusp_course2)
fulltext
## [1] "At MIT, DUSP is course: 11"

Combining a list of numbers (called a “vector”) using the c function

# c() combines individual values into a single vector
x <- c(5,3,8,7,3)
x
## [1] 5 3 8 7 3
# A vector can hold text values as well
states <- c("MA", "UT", "MD", "OH", "ND")
states
## [1] "MA" "UT" "MD" "OH" "ND"

Apply a manipulation to every item in a vector

# We have x from before
x
## [1] 5 3 8 7 3
# Multiply each item by 10. The result is only displayed, not assigned,
# so x itself keeps its original values.
x * 10
## [1] 50 30 80 70 30

Creating a vector of numbers using a : (colon)

# The colon says make a sequence of whole numbers from 2 to 10 (both ends included)
t <- 2:10
t
## [1]  2  3  4  5  6  7  8  9 10
# A short vector of even numbers typed out by hand
w <- c(2,4,6,8)
# seq() builds a longer sequence without typing each value: from 2 to 100 in steps of 2
w2 <- seq(2,100,by=2)
w2
##  [1]   2   4   6   8  10  12  14  16  18  20  22  24  26  28  30  32  34
## [18]  36  38  40  42  44  46  48  50  52  54  56  58  60  62  64  66  68
## [35]  70  72  74  76  78  80  82  84  86  88  90  92  94  96  98 100

Apply a function to a list of numbers

# Count the number of items in a vector
length(x)
## [1] 5
# Calculate the average (mean) of the values
m <- mean(x)
m
## [1] 5.2

Putting a few steps together to calculate the percent of the total

# Calculate the total
t <- sum(x)
t
## [1] 26
# Calculate each value's share of the total. The result is a fraction
# between 0 and 1; multiply it by 100 to express it as a percentage.
p <- x/t
p
## [1] 0.1923077 0.1153846 0.3076923 0.2692308 0.1153846

Label the items in a vector

# We have the variables x and states from before. Both vectors must have the same number of items.
x
## [1] 5 3 8 7 3
# Assign the states as names to the items in the vector
names(x) <- states
x
## MA UT MD OH ND 
##  5  3  8  7  3

Make a barplot of the values in a vector

# Now that the values have names associated with them, the names will show up on the x-axis as labels
barplot(x)

Introduction to the “dataframe” data type

# rivers is a built-in dataset which is a vector of numbers:
# the length (in miles) of 141 "major" rivers in North America, as compiled by the US Geological Survey.
rivers
##   [1]  735  320  325  392  524  450 1459  135  465  600  330  336  280  315
##  [15]  870  906  202  329  290 1000  600  505 1450  840 1243  890  350  407
##  [29]  286  280  525  720  390  250  327  230  265  850  210  630  260  230
##  [43]  360  730  600  306  390  420  291  710  340  217  281  352  259  250
##  [57]  470  680  570  350  300  560  900  625  332 2348 1171 3710 2315 2533
##  [71]  780  280  410  460  260  255  431  350  760  618  338  981 1306  500
##  [85]  696  605  250  411 1054  735  233  435  490  310  460  383  375 1270
##  [99]  545  445 1885  380  300  380  377  425  276  210  800  420  350  360
## [113]  538 1100 1205  314  237  610  360  540 1038  424  310  300  444  301
## [127]  268  620  215  652  900  525  246  360  529  500  720  270  430  671
## [141] 1770
# You should think of a "dataframe" as an Excel spreadsheet with columns and rows.
# data.frame(rivers) makes a dataframe with a single column, named rivers.
mydata <- data.frame(rivers)
mydata
##     rivers
## 1      735
## 2      320
## 3      325
## 4      392
## 5      524
## 6      450
## 7     1459
## 8      135
## 9      465
## 10     600
## 11     330
## 12     336
## 13     280
## 14     315
## 15     870
## 16     906
## 17     202
## 18     329
## 19     290
## 20    1000
## 21     600
## 22     505
## 23    1450
## 24     840
## 25    1243
## 26     890
## 27     350
## 28     407
## 29     286
## 30     280
## 31     525
## 32     720
## 33     390
## 34     250
## 35     327
## 36     230
## 37     265
## 38     850
## 39     210
## 40     630
## 41     260
## 42     230
## 43     360
## 44     730
## 45     600
## 46     306
## 47     390
## 48     420
## 49     291
## 50     710
## 51     340
## 52     217
## 53     281
## 54     352
## 55     259
## 56     250
## 57     470
## 58     680
## 59     570
## 60     350
## 61     300
## 62     560
## 63     900
## 64     625
## 65     332
## 66    2348
## 67    1171
## 68    3710
## 69    2315
## 70    2533
## 71     780
## 72     280
## 73     410
## 74     460
## 75     260
## 76     255
## 77     431
## 78     350
## 79     760
## 80     618
## 81     338
## 82     981
## 83    1306
## 84     500
## 85     696
## 86     605
## 87     250
## 88     411
## 89    1054
## 90     735
## 91     233
## 92     435
## 93     490
## 94     310
## 95     460
## 96     383
## 97     375
## 98    1270
## 99     545
## 100    445
## 101   1885
## 102    380
## 103    300
## 104    380
## 105    377
## 106    425
## 107    276
## 108    210
## 109    800
## 110    420
## 111    350
## 112    360
## 113    538
## 114   1100
## 115   1205
## 116    314
## 117    237
## 118    610
## 119    360
## 120    540
## 121   1038
## 122    424
## 123    310
## 124    300
## 125    444
## 126    301
## 127    268
## 128    620
## 129    215
## 130    652
## 131    900
## 132    525
## 133    246
## 134    360
## 135    529
## 136    500
## 137    720
## 138    270
## 139    430
## 140    671
## 141   1770

Number of rows in a dataframe

# nrow() returns the number of rows in the dataframe (here, one row per river)
nrow(mydata)
## [1] 141

Display only part of the data in a dataframe

# Display the first six rows (six is what head() shows when no count is given)
head(mydata)
##   rivers
## 1    735
## 2    320
## 3    325
## 4    392
## 5    524
## 6    450
# Display the first ten rows by passing the number of rows as the second argument
head(mydata,10)
##    rivers
## 1     735
## 2     320
## 3     325
## 4     392
## 5     524
## 6     450
## 7    1459
## 8     135
## 9     465
## 10    600
# Display the last six rows
tail(mydata)
##     rivers
## 136    500
## 137    720
## 138    270
## 139    430
## 140    671
## 141   1770

Basic stats about a dataframe

# summary() reports the minimum, quartiles, median, mean and maximum of each column
summary(mydata)
##      rivers      
##  Min.   : 135.0  
##  1st Qu.: 310.0  
##  Median : 425.0  
##  Mean   : 591.2  
##  3rd Qu.: 680.0  
##  Max.   :3710.0

Create a histogram of the data

# Send one column to the histogram function and let the function
# determine the number of buckets itself
hist(mydata$rivers)

# Suggest the number of buckets yourself. NOTE: when breaks is a single
# number, hist() treats it as a suggestion only and may use a nearby
# "pretty" number of buckets rather than exactly 4.
hist(mydata$rivers, breaks=4)

Create a new column in the dataframe. Refer to columns using the $ sign

# Create a new column with kilometers instead of miles (1 mile is about 1.609 km).
# Assigning to mydata$riverskm adds the column because it does not exist yet.
mydata$riverskm <- mydata$rivers * 1.609
head(mydata)
##   rivers riverskm
## 1    735 1182.615
## 2    320  514.880
## 3    325  522.925
## 4    392  630.728
## 5    524  843.116
## 6    450  724.050

Make up some numbers

# The water quality for each river is evaluated on a scale of 1 to 100.
# Let's assign a random value for each river. We need 141 random numbers.
# replace = TRUE lets the same value be drawn more than once, which is
# required here because we draw more numbers (141) than there are values (100).
# TRUE is spelled out because the shorthand T is an ordinary variable that
# can be reassigned.
# The numbers are random, so your output will differ from the one shown below.
s <- sample(1:100, 141, replace = TRUE)
s
##   [1]  8  5 68 30 40 18 34 92 37 59 64 84 81 83  2 65 19 22 13 37  7 11  8
##  [24] 51 54 97 34  4 12 21 15 22 28 61 66 14 76 48 79 80 51 87 84 18 86 42
##  [47]  5 12 39  7 38 83 81 55  7 85 17 46 85 31 82 97 50 41 54 74 36 88  8
##  [70] 80 57 24  4 12 74 23 71 69 97 85 69 39 66 94 79 56 73  1 89 95 99  2
##  [93] 96 12 27 81 66 62 44 84 64 28 87 78 28 33 43 23  9 28 76 79 28 91 76
## [116] 40 33 25 33 15  9 88  5 83  9  9 51 41 23 42 82 24 75 17  2  9 48 35
## [139] 24 93  3
# Make a new column in the dataframe with water quality.
# s holds 141 values, one for each of the 141 rows of mydata.
mydata$quality <- s
head(mydata)
##   rivers riverskm quality
## 1    735 1182.615       8
## 2    320  514.880       5
## 3    325  522.925      68
## 4    392  630.728      30
## 5    524  843.116      40
## 6    450  724.050      18

Plotting x,y values to investigate correlation

# We want to investigate whether there is a correlation between the length of a river and its water quality.
# To do that, we want to make a scatter plot of the river lengths (x-axis) and the water quality (y-axis).
plot(mydata$rivers,mydata$quality)