Materials from the lecture on tidy data. Much of this is copied from the course website, but not everything.
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(readr)
library(readxl)
library(haven)
library(janitor)
First we’ll use gather to go from wide format to long format.
Load the pulse data and remind ourselves of the structure; this shows that the attributes across variables are different.
pulse_data = read_sas("./data/public_pulse_data.sas7bdat") %>%
clean_names()
pulse_data %>% str()
## Classes 'tbl_df', 'tbl' and 'data.frame': 1087 obs. of 7 variables:
## $ id : num 10003 10015 10022 10026 10035 ...
## $ age : atomic 48 72.5 58.5 72.7 60.4 ...
## ..- attr(*, "label")= chr "Age at baseline"
## ..- attr(*, "format.sas")= chr "AGEA"
## $ sex : chr "male" "male" "male" "male" ...
## $ bdiscore_bl : atomic 7 6 14 20 4 2 4 5 0 10 ...
## ..- attr(*, "label")= chr "BDI, BL IMPUTED SCORE"
## ..- attr(*, "format.sas")= chr "F"
## $ bdiscore_01m: atomic 1 NA 3 6 0 10 0 NA 3 2 ...
## ..- attr(*, "label")= chr "BDI, 1 MONTH IMPUTED SCORE"
## ..- attr(*, "format.sas")= chr "F"
## $ bdiscore_06m: atomic 2 NA 8 18 1 12 NA 0 4 11 ...
## ..- attr(*, "label")= chr "BDI, 6 MONTH IMPUTED SCORE"
## ..- attr(*, "format.sas")= chr "F"
## $ bdiscore_12m: atomic 0 NA NA 16 2 8 NA 2 0 6 ...
## ..- attr(*, "label")= chr "BDI, 12 MONTH IMPUTED SCORE"
## ..- attr(*, "format.sas")= chr "F"
Tidy the data (i.e. wide to long); then use separate to split bdiscore
from bl
, 01m
, etc; then turn the final visit
variable into a factor.
pulse_tidy = gather(pulse_data, key = "visit", value = "bdi_score", bdiscore_bl:bdiscore_12m)
## Warning: attributes are not identical across measure variables;
## they will be dropped
separate(pulse_tidy, col = visit, into = c("remove", "visit"), sep = "_") %>%
select(-remove) %>%
pull(visit) %>%
unique()
## [1] "bl" "01m" "06m" "12m"
separate(pulse_tidy, col = visit, into = c("remove", "visit"), sep = "_") %>%
select(-remove) %>%
mutate(visit = replace(visit, visit == "bl", "00m"),
visit = factor(visit, levels = paste0(c("00", "01", "06", "12"), "m")))
## # A tibble: 4,348 x 5
## id age sex visit bdi_score
## <dbl> <dbl> <chr> <fctr> <dbl>
## 1 10003 48.03836 male 00m 7
## 2 10015 72.47123 male 00m 6
## 3 10022 58.50959 male 00m 14
## 4 10026 72.72877 male 00m 20
## 5 10035 60.40274 male 00m 4
## 6 10050 84.73833 male 00m 2
## 7 10078 31.34521 male 00m 4
## 8 10088 56.85205 male 00m 5
## 9 10091 75.99178 male 00m 0
## 10 10092 74.23401 female 00m 10
## # ... with 4,338 more rows
Full data cleaning chain for the pulse data.
pulse_data = read_sas("./data/public_pulse_data.sas7bdat") %>%
clean_names() %>%
gather(key = visit, value = bdi, bdiscore_bl:bdiscore_12m) %>%
separate(visit, into = c("remove", "visit"), sep = "_") %>%
select(id, visit, everything(), -remove) %>%
mutate(visit = replace(visit, visit == "bl", "00m"),
visit = factor(visit, levels = paste0(c("00", "01", "06", "12"), "m"))) %>%
arrange(id, visit)
## Warning: attributes are not identical across measure variables;
## they will be dropped
As a second example of gathering, we’ll revisit the litters data.
read_csv("./data/FAS_litters.csv", col_types = "ccddiiii") %>%
clean_names() %>%
separate(group, c("tx_group", "day_of_tx"), 1) %>%
select(tx_group, litter_number, ends_with("_weight")) %>%
gather(key = "gd", value = "weight", gd0_weight:gd18_weight)
## # A tibble: 98 x 4
## tx_group litter_number gd weight
## <chr> <chr> <chr> <dbl>
## 1 C #85 gd0_weight 19.7
## 2 C #1/2/95/2 gd0_weight 27.0
## 3 C #5/5/3/83/3-3 gd0_weight 26.0
## 4 C #5/4/2/95/2 gd0_weight 28.5
## 5 C #4/2/95/3-3 gd0_weight NA
## 6 C #2/2/95/3-2 gd0_weight NA
## 7 C #1/5/3/83/3-3/2 gd0_weight NA
## 8 C #3/83/3-3 gd0_weight NA
## 9 C #2/95/3 gd0_weight NA
## 10 C #3/5/2/2/95 gd0_weight 28.5
## # ... with 88 more rows
We need code that will:
The chunk below does this, and is copied from the course website.
fellowship_ring = read_excel("./data/LotR_Words.xlsx", range = "B3:D6") %>%
clean_names() %>%
gather(key = sex, value = words, female:male) %>%
mutate(race = tolower(race),
movie = "Fellowship")
two_towers = read_excel("./data/LotR_Words.xlsx", range = "F3:H6") %>%
clean_names() %>%
gather(key = sex, value = words, female:male) %>%
mutate(race = tolower(race),
movie = "Two Towers")
return_king = read_excel("./data/LotR_Words.xlsx", range = "J3:L6") %>%
clean_names() %>%
gather(key = sex, value = words, female:male) %>%
mutate(race = tolower(race),
movie = "Return")
lotr_tidy = bind_rows(fellowship_ring, two_towers, return_king) %>%
select(movie, everything())
Lastly we’re going to join the pup and litter datasets. We’ll load and clean each dataset separately, and then join them using litter_number
.
pup_data = read_csv("./data/FAS_pups.csv", col_types = "ciiiii") %>%
clean_names() %>%
mutate(sex = recode(sex, `1` = "male", `2` = "female"))
litter_data = read_csv("./data/FAS_litters.csv", col_types = "ccddiiii") %>%
clean_names() %>%
select(-pups_survive) %>%
mutate(
wt_gain = gd18_weight - gd0_weight,
group = tolower(group))
fas_data = left_join(pup_data, litter_data)
## Joining, by = "litter_number"