# load packages
# requireNamespace() only checks that pacman is installed; it does not attach
# it, which is enough here because p_load() is called through pacman::
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load(
  countdown, tidyverse, glue, scales, ggthemes,
  gt, palmerpenguins, openintro, ggrepel
)

# set theme for ggplot2
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

# set width of code output
options(width = 65)

# set figure parameters for knitr
knitr::opts_chunk$set(
  fig.width = 7, # 7" width
  fig.asp = 0.618, # the golden ratio
  fig.retina = 3, # dpi multiplier for displaying HTML output on retina
  fig.align = "center", # center align figures
  dpi = 300 # higher dpi, sharper image
)
Missing values I
Is it ok to suppress the following warning? Or should you update your code to eliminate it?
professions
# A tibble: 10 × 2
name profession
<chr> <chr>
1 Ada Lovelace Mathematician
2 Marie Curie Physicist and Chemist
3 Janaki Ammal Botanist
4 Chien-Shiung Wu Physicist
5 Katherine Johnson Mathematician
6 Rosalind Franklin Chemist
7 Vera Rubin Astronomer
8 Gladys West Mathematician
9 Flossie Wong-Staal Virologist and Molecular Biologist
10 Jennifer Doudna Biochemist
dates
# A tibble: 8 × 3
name birth_year death_year
<chr> <dbl> <dbl>
1 Janaki Ammal 1897 1984
2 Chien-Shiung Wu 1912 1997
3 Katherine Johnson 1918 2020
4 Rosalind Franklin 1920 1958
5 Vera Rubin 1928 2016
6 Gladys West 1930 NA
7 Flossie Wong-Staal 1947 NA
8 Jennifer Doudna 1964 NA
works
# A tibble: 9 × 2
name known_for
<chr> <chr>
1 Ada Lovelace first computer algorithm
2 Marie Curie theory of radioactivity, first woman Nobel…
3 Janaki Ammal hybrid species, biodiversity protection
4 Chien-Shiung Wu experiment overturning theory of parity
5 Katherine Johnson orbital mechanics critical to sending first…
6 Vera Rubin existence of dark matter
7 Gladys West mathematical modeling of the shape of the E…
8 Flossie Wong-Staal first to clone HIV and map its genes, which…
9 Jennifer Doudna one of the primary developers of CRISPR
Desired output
# A tibble: 10 × 5
name profession birth_year death_year known_for
<chr> <chr> <dbl> <dbl> <chr>
1 Ada Lovelace Mathematic… NA NA first co…
2 Marie Curie Physicist … NA NA theory o…
3 Janaki Ammal Botanist 1897 1984 hybrid s…
4 Chien-Shiung Wu Physicist 1912 1997 experime…
5 Katherine Johnson Mathematic… 1918 2020 orbital …
6 Rosalind Franklin Chemist 1920 1958 <NA>
7 Vera Rubin Astronomer 1928 2016 existenc…
8 Gladys West Mathematic… 1930 NA mathemat…
9 Flossie Wong-Staal Virologist… 1947 NA first to…
10 Jennifer Doudna Biochemist 1964 NA one of t…
Inputs, reminder
names(professions)
[1] "name" "profession"
names(dates)
[1] "name" "birth_year" "death_year"
names(works)
[1] "name" "known_for"
nrow(professions)
[1] 10
nrow(dates)
[1] 8
nrow(works)
[1] 9
Joining data frames
something_join(x, y)
left_join(): all rows from x
right_join(): all rows from y
full_join(): all rows from both x and y
semi_join(): all rows from x where there are matching values in y, keeping just columns from x
inner_join(): all rows from x where there are matching values in y; when there are multiple matches, all combinations of the matches are returned
anti_join(): all rows from x where there are no matching values in y; never duplicates rows of x
…
Setup
For the next few slides…
# example data frame x: three ids, each with its own value_x label
x <- tibble(
  id = c(1, 2, 3),
  value_x = c("x1", "x2", "x3")
)
# print x
x
# A tibble: 3 × 2
id value_x
<dbl> <chr>
1 1 x1
2 2 x2
3 3 x3
# example data frame y: three ids, each with its own value_y label
y <- tibble(
  id = c(1, 2, 4),
  value_y = c("y1", "y2", "y4")
)
# print y
y
# A tibble: 3 × 2
id value_y
<dbl> <chr>
1 1 y1
2 2 y2
3 4 y4
# A tibble: 10 × 4
name profession birth_year death_year
<chr> <chr> <dbl> <dbl>
1 Ada Lovelace Mathematician NA NA
2 Marie Curie Physicist and Chemist NA NA
3 Janaki Ammal Botanist 1897 1984
4 Chien-Shiung Wu Physicist 1912 1997
5 Katherine Johnson Mathematician 1918 2020
6 Rosalind Franklin Chemist 1920 1958
7 Vera Rubin Astronomer 1928 2016
8 Gladys West Mathematician 1930 NA
9 Flossie Wong-Staal Virologist and Molec… 1947 NA
10 Jennifer Doudna Biochemist 1964 NA
# A tibble: 10 × 4
name birth_year death_year known_for
<chr> <dbl> <dbl> <chr>
1 Janaki Ammal 1897 1984 hybrid species, biod…
2 Chien-Shiung Wu 1912 1997 experiment overturni…
3 Katherine Johnson 1918 2020 orbital mechanics cr…
4 Rosalind Franklin 1920 1958 <NA>
5 Vera Rubin 1928 2016 existence of dark ma…
6 Gladys West 1930 NA mathematical modelin…
7 Flossie Wong-Staal 1947 NA first to clone HIV a…
8 Jennifer Doudna 1964 NA one of the primary d…
9 Ada Lovelace NA NA first computer algor…
10 Marie Curie NA NA theory of radioactiv…
inner_join()
inner_join(x, y)
Joining with `by = join_by(id)`
# A tibble: 2 × 3
id value_x value_y
<dbl> <chr> <chr>
1 1 x1 y1
2 2 x2 y2
inner_join()
# rows of dates that have a match in works, with the columns of both
dates |> inner_join(works)
Joining with `by = join_by(name)`
# A tibble: 7 × 4
name birth_year death_year known_for
<chr> <dbl> <dbl> <chr>
1 Janaki Ammal 1897 1984 hybrid species, biodi…
2 Chien-Shiung Wu 1912 1997 experiment overturnin…
3 Katherine Johnson 1918 2020 orbital mechanics cri…
4 Vera Rubin 1928 2016 existence of dark mat…
5 Gladys West 1930 NA mathematical modeling…
6 Flossie Wong-Staal 1947 NA first to clone HIV an…
7 Jennifer Doudna 1964 NA one of the primary de…
semi_join()
semi_join(x, y)
Joining with `by = join_by(id)`
# A tibble: 2 × 2
id value_x
<dbl> <chr>
1 1 x1
2 2 x2
semi_join()
# rows of dates that have a match in works, keeping only the columns of dates
dates |> semi_join(works)
Joining with `by = join_by(name)`
# A tibble: 7 × 3
name birth_year death_year
<chr> <dbl> <dbl>
1 Janaki Ammal 1897 1984
2 Chien-Shiung Wu 1912 1997
3 Katherine Johnson 1918 2020
4 Vera Rubin 1928 2016
5 Gladys West 1930 NA
6 Flossie Wong-Staal 1947 NA
7 Jennifer Doudna 1964 NA
anti_join()
anti_join(x, y)
Joining with `by = join_by(id)`
# A tibble: 1 × 2
id value_x
<dbl> <chr>
1 3 x3
anti_join()
# rows of dates that have no match in works
dates |> anti_join(works)
Joining with `by = join_by(name)`
# A tibble: 1 × 3
name birth_year death_year
<chr> <dbl> <dbl>
1 Rosalind Franklin 1920 1958
Joining with `by = join_by(name)`
Joining with `by = join_by(name)`
scientists
# A tibble: 10 × 5
name profession birth_year death_year known_for
<chr> <chr> <dbl> <dbl> <chr>
1 Ada Lovelace Mathematic… NA NA first co…
2 Marie Curie Physicist … NA NA theory o…
3 Janaki Ammal Botanist 1897 1984 hybrid s…
4 Chien-Shiung Wu Physicist 1912 1997 experime…
5 Katherine Johnson Mathematic… 1918 2020 orbital …
6 Rosalind Franklin Chemist 1920 1958 <NA>
7 Vera Rubin Astronomer 1928 2016 existenc…
8 Gladys West Mathematic… 1930 NA mathemat…
9 Flossie Wong-Staal Virologist… 1947 NA first to…
10 Jennifer Doudna Biochemist 1964 NA one of t…
*_join() functions
From dplyr
Incredibly useful for bringing datasets with common information (e.g., unique identifier) together
Use the `by` argument when the names of the columns containing the common information are not the same across datasets
Always check that the number of rows and columns in the resulting dataset makes sense