01_baseRdataHandling/day1.R
0 → 100644
View file @
8c82bce8
# basic R usage: R can be used as a calculator
4 + 6

# store values in variables with the assignment operator `<-`
x <- 6
y <- 4
x + y

# list all variables in your environment
ls()

# call a function by passing its argument in parentheses
sqrt(16)

# to remove a variable
rm(x)
# c() combines individual values into a vector
z <- c(5, 9, 1, 0)
x <- c(5, 9)

# combine two vectors
combinedVector <- c(z, x)

# another way to create a vector is to use seq()
seq(1, 9, by = 2)        # from 1 to 9 in steps of 2
seq(8, 20, length = 6)   # 6 evenly spaced values from 8 to 20
# R will handle vectors for vector arithmetic
x <- seq(1, 5, by = 1)
y <- 1:5

# adding two vectors will return a vector of sums
x + y

# add a vector and a numeric value
x + 3
x   # x itself is unchanged because the result was not assigned

# the shorter vector is recycled to match the longer one
shortVector <- c(1, 3)
longVector <- 1:6
shortVector + longVector
longVector

# lengths 5 and 6 are not multiples of each other, so R still recycles
# but emits a warning
midVector <- 1:5
midVector
midVector + longVector
# some useful functions that work with vectors
length(midVector)
mean(midVector)
summary(longVector)
min(longVector)
max(longVector, shortVector)   # maximum over all values of both vectors

# open the help page for seq()
?seq
# how to subset vectors
x <- c(7.5, 8.2, 3.1, 5.6, 8.2, 9.3, 6.5, 7.0, 9.3, 1.2, 14.5, 6.2)
mean(x)

x[1]             # a single element by position
x[c(1, 5, 8)]    # several elements by position
x[1:5]           # a range of positions
head(x)          # the first elements of the vector
# arrange values in a vector
sort(x)
sort(x = x, decreasing = TRUE)
sort(x, TRUE)
sort(longVector, decreasing = TRUE)

# The next two calls fail on purpose. They are wrapped in try() so the
# error message is still shown but the rest of the script keeps running.

# this will cause an error: `decreasing` is read as a variable name
try(sort(x, decreasing))

# this will cause a confusing error: the arguments are matched by position,
# so TRUE is taken as the data and x as `decreasing`
# this is why it's safer to use named parameters
try(sort(TRUE, x))

# best way to call this function
sort(x = x, decreasing = TRUE)
sort(x = longVector, decreasing = TRUE)
# Data types

# numerics
9
a <- 9
# is.* functions test for the data type
is.numeric(a)

# character
myChar <- "t"
is.numeric(myChar)
is.character(myChar)

# logicals
TRUE
FALSE
myLgl <- TRUE
is.logical(myLgl)

# save numbers as characters
myCharNum <- "9"
is.numeric(myCharNum)

# change the type of an object
# called "coercion"
as.numeric(myCharNum)
# no quotes in the printed value means that an object is a number
# quotes mean that it is a character
myCoercedNum <- as.numeric(myCharNum)

# functions to find classes or data types
class(myChar)
typeof(myChar)
str(myChar)
# Matrix
# six values filled column by column into 3 rows gives a 3 x 2 matrix
myMatrix <- matrix(data = c(5, 7, 9, 3, 4, 6), nrow = 3)

# vectors of equal length can be bound together as columns
myVec1 <- 3:9
myVec2 <- 13:19
cbind(myVec1, myVec2)

# arithmetic is applied to every element; myMatrix itself is unchanged
myMatrix * 2
myMatrix

# Pull data out of matrix similarly to vectors
myMatrix[1, 1]
myMatrix[1, c(1, 2)]

# short cut: leave the row index empty to get a whole column
# myMatrix has only two columns, so asking for column 3 is an error;
# try() shows the message without stopping the script
try(myMatrix[, 3])
myMatrix[, 2]

# to pull out values from a matrix
# use [rows,columns]
myMatrix[1, ]    # only the first row
# this will print everything except the first row
myMatrix[-1, ]
# a vector holds a single data type: mixing numbers and text
# turns every element into a character
myMixedVector <- c(1, 2, 4, "a")

# what if we want to hold different data types in the same
# object
# we can use a list
myList <- list(1, 2, 4, "a")

# named list
myNamedList <- list(myFirstElem = 1, mySecond = 3, myCharElem = "a")
myNamedList

# $ sign notation can pull out named elements
myNamedList$mySecond

# single [] will always return a list (from a list)
myNamedList[2]
mySub1 <- myNamedList[2]

# double [[]] will return value at the position
myNamedList[[2]]
# review of accessing elements in a vector

# accessing by name
namedVector <- c(Alice = 5.5, Bob = 6.4, Steve = 5.9)
namedVector
namedVector["Alice"]

# access by position
namedVector[1]

# access using logicals
namedVector[c(TRUE, TRUE, FALSE)]

# a comparison returns one logical per element
# note: this compares the values (not the names) with "Alice",
# so every element is FALSE
namedVector == "Alice"
namedVector == 5.5
# the logical result can be used directly as an index
namedVector[namedVector == 5.5]

# other comparison operators: > >= != <=
# recap: pull an element out of a list and out of a vector by name
myNamedList$myFirstElem
namedVector["Alice"]

# coercion
# change the data type of an object
as.numeric("9")
as.numeric("a")   # text that is not a number becomes NA, with a warning
library(tidyverse)

# Load the example data set; this creates an object called `bodyfat`.
# The url() connection is kept in a variable so that it can be closed
# explicitly after loading instead of being left open.
bodyfatCon <- url("http://wwwhuber.embl.de/users/klaus/BasicR/bodyfat.rda")
load(bodyfatCon)
close(bodyfatCon)

# keep a copy of the object exactly as it was loaded
bodyfatDF <- bodyfat
head(bodyfat)
str(bodyfat)
bodyfat
as_tibble(bodyfat)

# two ways to create a tibble
bodyfat <- as_tibble(bodyfat)   # coerce
tibble(bodyfatDF)               # create new from data

head(bodyfat)
bodyfat
# interact with this tibble
# filter() will let you pull out rows from a tibble
bodyfat
filter(.data = bodyfat, age < 40)

# & keeps rows where both conditions are true
filter(.data = bodyfat, age > 40 & age < 60)

# | keeps rows where at least one condition is true
filter(bodyfat, age < 40 | age > 60)
# arrange rows in different orders
arrange(bodyfat, age)
# rows with the same age are then ordered by weight
arrange(bodyfat, age, weight)

# change the direction of the order
arrange(bodyfat, desc(age), weight)
# select columns
select(bodyfat, age, weight)

# select columns we don't want
# a leading minus drops the column instead of keeping it
select(bodyfat, -age, -weight)

# another way to pull out a column is by name
bodyfat$age

# another using baseR to pull out data []
bodyfat[1, 5]   # row 1, column 5
# create new data
# mutate() adds a column computed from existing ones
mutate(bodyfat, weight_kg = weight * 0.454)

# step by step, storing the intermediate result
bodyfatWithKG <- mutate(bodyfat, weight_kg = weight * 0.454)
oneCol <- select(bodyfatWithKG, weight_kg)

# chaining aka piping
# avoids creating intermediate objects
# %>% must end the line so that R knows the expression continues
oneCol <- mutate(bodyfat, weight_kg = weight * 0.454) %>%
  select(weight_kg)
# the same idea on a plain vector
x
sort(x)
head(x, n = 2)

# the pipe passes the left-hand side as the first argument
x %>% sort()
x %>% sort(decreasing = TRUE)

# sort, keep the first two values, then average them
x %>%
  sort() %>%
  head(n = 2) %>%
  mean()

# alternative to write this without pipes
# this is hard to read: the calls have to be followed from the inside out
mean(head(sort(x), n = 2))
# Challenge
# create a new column in the bodyfat tibble
# called height_m
# conversion : in to m = x*0.0254
# save that as bodyfat
# show just the two height columns for age > 40
# yellow sticky when you're done

# solution: the result is stored as bodyfat_converted here, so the
# original bodyfat tibble is left as it was
bodyfat_converted <- mutate(bodyfat, height_m = height * 0.0254)

bodyfat_converted %>%
  filter(age > 40) %>%
  select(height, height_m)
# summarise
summarise(bodyfat, meanAge = mean(age))
summarise(bodyfat, meanAge = mean(age), medianAge = median(age))
# this only returns the columns that we calculated

# add a logical grouping column, then summarise each group separately
bodyfat %>%
  mutate(olderThan40 = age > 40) %>%
  group_by(olderThan40) %>%
  summarise(meanAge = mean(age), meanWeight = mean(weight))
