Materials from the lecture on tidy data. Much of this is copied from the course website, but not everything.

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(readr)
library(readxl)
library(haven)
library(janitor)

gather

First we’ll use gather to go from wide format to long format.

Load the pulse data and remind ourselves of the structure; this shows that the attributes across variables are different.

pulse_data = read_sas("./data/public_pulse_data.sas7bdat") %>%
  clean_names()

pulse_data %>% str()
## Classes 'tbl_df', 'tbl' and 'data.frame':    1087 obs. of  7 variables:
##  $ id          : num  10003 10015 10022 10026 10035 ...
##  $ age         : atomic  48 72.5 58.5 72.7 60.4 ...
##   ..- attr(*, "label")= chr "Age at baseline"
##   ..- attr(*, "format.sas")= chr "AGEA"
##  $ sex         : chr  "male" "male" "male" "male" ...
##  $ bdiscore_bl : atomic  7 6 14 20 4 2 4 5 0 10 ...
##   ..- attr(*, "label")= chr "BDI, BL IMPUTED SCORE"
##   ..- attr(*, "format.sas")= chr "F"
##  $ bdiscore_01m: atomic  1 NA 3 6 0 10 0 NA 3 2 ...
##   ..- attr(*, "label")= chr "BDI, 1 MONTH IMPUTED SCORE"
##   ..- attr(*, "format.sas")= chr "F"
##  $ bdiscore_06m: atomic  2 NA 8 18 1 12 NA 0 4 11 ...
##   ..- attr(*, "label")= chr "BDI, 6 MONTH IMPUTED SCORE"
##   ..- attr(*, "format.sas")= chr "F"
##  $ bdiscore_12m: atomic  0 NA NA 16 2 8 NA 2 0 6 ...
##   ..- attr(*, "label")= chr "BDI, 12 MONTH IMPUTED SCORE"
##   ..- attr(*, "format.sas")= chr "F"

Tidy the data (i.e. wide to long); then use separate to split bdiscore from bl, 01m, etc; then turn the final visit variable into a factor.

pulse_tidy = gather(pulse_data, key = "visit", value = "bdi_score", bdiscore_bl:bdiscore_12m) 
## Warning: attributes are not identical across measure variables;
## they will be dropped
separate(pulse_tidy, col = visit, into = c("remove", "visit"), sep = "_") %>%
  select(-remove) %>%
  pull(visit) %>%
  unique()
## [1] "bl"  "01m" "06m" "12m"
separate(pulse_tidy, col = visit, into = c("remove", "visit"), sep = "_") %>%
  select(-remove) %>% 
  mutate(visit = replace(visit, visit == "bl", "00m"),
         visit = factor(visit, levels = paste0(c("00", "01", "06", "12"), "m")))
## # A tibble: 4,348 x 5
##       id      age    sex  visit bdi_score
##    <dbl>    <dbl>  <chr> <fctr>     <dbl>
##  1 10003 48.03836   male    00m         7
##  2 10015 72.47123   male    00m         6
##  3 10022 58.50959   male    00m        14
##  4 10026 72.72877   male    00m        20
##  5 10035 60.40274   male    00m         4
##  6 10050 84.73833   male    00m         2
##  7 10078 31.34521   male    00m         4
##  8 10088 56.85205   male    00m         5
##  9 10091 75.99178   male    00m         0
## 10 10092 74.23401 female    00m        10
## # ... with 4,338 more rows

Full data cleaning chain for the pulse data.

pulse_data = read_sas("./data/public_pulse_data.sas7bdat") %>%
  clean_names() %>%
  gather(key = visit, value = bdi, bdiscore_bl:bdiscore_12m) %>%
  separate(visit, into = c("remove", "visit"), sep = "_") %>%
  select(id, visit, everything(), -remove) %>%
  mutate(visit = replace(visit, visit == "bl", "00m"),
         visit = factor(visit, levels = paste0(c("00", "01", "06", "12"), "m"))) %>%
  arrange(id, visit)
## Warning: attributes are not identical across measure variables;
## they will be dropped

As a second example of gathering, we’ll revisit the litters data.

read_csv("./data/FAS_litters.csv", col_types = "ccddiiii") %>% 
  clean_names() %>%
  separate(group, c("tx_group", "day_of_tx"), 1) %>%
  select(tx_group, litter_number, ends_with("_weight")) %>%
  gather(key = "gd", value = "weight", gd0_weight:gd18_weight)
## # A tibble: 98 x 4
##    tx_group   litter_number         gd weight
##       <chr>           <chr>      <chr>  <dbl>
##  1        C             #85 gd0_weight   19.7
##  2        C       #1/2/95/2 gd0_weight   27.0
##  3        C   #5/5/3/83/3-3 gd0_weight   26.0
##  4        C     #5/4/2/95/2 gd0_weight   28.5
##  5        C     #4/2/95/3-3 gd0_weight     NA
##  6        C     #2/2/95/3-2 gd0_weight     NA
##  7        C #1/5/3/83/3-3/2 gd0_weight     NA
##  8        C       #3/83/3-3 gd0_weight     NA
##  9        C         #2/95/3 gd0_weight     NA
## 10        C     #3/5/2/2/95 gd0_weight   28.5
## # ... with 88 more rows

Binding rows!!

We need code that will:

  • read the LotR data for each movie separately
  • clean the data tables
  • bind the results into a single data frame.

The chunk below does this, and is copied from the course website.

fellowship_ring = read_excel("./data/LotR_Words.xlsx", range = "B3:D6") %>%
  clean_names() %>%
  gather(key = sex, value = words, female:male) %>%
  mutate(race = tolower(race),
         movie = "Fellowship")

two_towers = read_excel("./data/LotR_Words.xlsx", range = "F3:H6") %>%
  clean_names() %>%
  gather(key = sex, value = words, female:male) %>%
  mutate(race = tolower(race),
         movie = "Two Towers")

return_king = read_excel("./data/LotR_Words.xlsx", range = "J3:L6") %>%
  clean_names() %>%
  gather(key = sex, value = words, female:male) %>%
  mutate(race = tolower(race),
         movie = "Return")

lotr_tidy = bind_rows(fellowship_ring, two_towers, return_king) %>%
  select(movie, everything()) 

Yay joining!!

Lastly we’re going to join the pup and litter datasets. We’ll load and clean each dataset separately, and then join them using litter_number.

pup_data = read_csv("./data/FAS_pups.csv", col_types = "ciiiii") %>%
  clean_names() %>%
  mutate(sex = recode(sex, `1` = "male", `2` = "female")) 

litter_data = read_csv("./data/FAS_litters.csv", col_types = "ccddiiii") %>%
  clean_names() %>%
  select(-pups_survive) %>%
  mutate(
    wt_gain = gd18_weight - gd0_weight,
    group = tolower(group))

fas_data = left_join(pup_data, litter_data)
## Joining, by = "litter_number"