Reshaping a data frame and creating lag variables using tidyr and dplyr



library(tidyverse)


ae <- read.csv("ae_data_2.csv")

head(ae)
##   Arrival.Month Arrival.Day Arrival.Hour arrival_date_dt Mins.on.AE Over.4.hrs
## 1        Jun-12         Fri           20       01-Jun-12        108          0
## 2        Jun-12         Fri           12       01-Jun-12         49          0
## 3        Jun-12         Fri           18       01-Jun-12        167          0
## 4        Jun-12         Fri           13       01-Jun-12        140          0
## 5        Jun-12         Fri           12       01-Jun-12        233          0
## 6        Jun-12         Fri           23       01-Jun-12        233          0
##   Triage_Main_Code Triage_Colour
## 1                4         GREEN
## 2                4         GREEN
## 3                4         GREEN
## 4                3        YELLOW
## 5                3        YELLOW
## 6                4         GREEN


Summarise the data by day and triage colour


ae_summary <- ae %>%
  group_by(arrival_date_dt, Arrival.Day, Triage_Colour) %>% 
  summarise(no_of_arrivals = n()) %>% 
  arrange(arrival_date_dt) %>%
  ungroup()

head(ae_summary)
## # A tibble: 6 x 4
##   arrival_date_dt Arrival.Day Triage_Colour no_of_arrivals
##   <chr>           <chr>       <chr>                  <int>
## 1 01-Jun-12       Fri         BLUE                       2
## 2 01-Jun-12       Fri         GREEN                    104
## 3 01-Jun-12       Fri         ORANGE                    20
## 4 01-Jun-12       Fri         YELLOW                    84
## 5 02-Jun-12       Sat         BLUE                       1
## 6 02-Jun-12       Sat         GREEN                    109


Reshaping the data using tidyr



The tidyr package has the pivot_wider() and pivot_longer() functions (used to be gather() and spread())


To transform the data into wide format and have a separate column for each triage colour (so only one row per day) use pivot_wider()


ae_summary %>%
  pivot_wider(names_from = Triage_Colour, 
              values_from = no_of_arrivals)
## # A tibble: 5 x 6
##   arrival_date_dt Arrival.Day  BLUE GREEN ORANGE YELLOW
##   <chr>           <chr>       <int> <int>  <int>  <int>
## 1 01-Jun-12       Fri             2   104     20     84
## 2 02-Jun-12       Sat             1   109     21     96
## 3 03-Jun-12       Sun             1    92     19     86
## 4 04-Jun-12       Mon            NA   102     19     87
## 5 05-Jun-12       Tues            1   100     19     83


The values_fill argument will replace the na with a value


ae_summary %>%
  pivot_wider(names_from = Triage_Colour, 
              values_from = no_of_arrivals, 
              values_fill = 0) 
## # A tibble: 5 x 6
##   arrival_date_dt Arrival.Day  BLUE GREEN ORANGE YELLOW
##   <chr>           <chr>       <int> <int>  <int>  <int>
## 1 01-Jun-12       Fri             2   104     20     84
## 2 02-Jun-12       Sat             1   109     21     96
## 3 03-Jun-12       Sun             1    92     19     86
## 4 04-Jun-12       Mon             0   102     19     87
## 5 05-Jun-12       Tues            1   100     19     83


You can also add a prefix the name of the new columns using the names_prefix argument

ae_summary_wide <- ae_summary %>%
  pivot_wider(names_from = Triage_Colour, 
              values_from = no_of_arrivals, 
              values_fill = 0,
              names_prefix = "no_of_arrivals_") # adds prefix to column names 
## # A tibble: 5 x 6
##   arrival_date_dt Arrival.Day no_of_arrivals_… no_of_arrivals_… no_of_arrivals_…
##   <chr>           <chr>                  <int>            <int>            <int>
## 1 01-Jun-12       Fri                        2              104               20
## 2 02-Jun-12       Sat                        1              109               21
## 3 03-Jun-12       Sun                        1               92               19
## 4 04-Jun-12       Mon                        0              102               19
## 5 05-Jun-12       Tues                       1              100               19
## # … with 1 more variable: no_of_arrivals_YELLOW <int>


To change the data back into long format use pivot_longer()

ae_summary_wide %>%
  pivot_longer(cols = starts_with("no_of_arrivals_"), # can also select columns by name or index
                names_to = "triage_colour",
               values_to = "no_of_arrivals") 
## # A tibble: 20 x 4
##    arrival_date_dt Arrival.Day triage_colour         no_of_arrivals
##    <chr>           <chr>       <chr>                          <int>
##  1 01-Jun-12       Fri         no_of_arrivals_BLUE                2
##  2 01-Jun-12       Fri         no_of_arrivals_GREEN             104
##  3 01-Jun-12       Fri         no_of_arrivals_ORANGE             20
##  4 01-Jun-12       Fri         no_of_arrivals_YELLOW             84
##  5 02-Jun-12       Sat         no_of_arrivals_BLUE                1
##  6 02-Jun-12       Sat         no_of_arrivals_GREEN             109
##  7 02-Jun-12       Sat         no_of_arrivals_ORANGE             21
##  8 02-Jun-12       Sat         no_of_arrivals_YELLOW             96
##  9 03-Jun-12       Sun         no_of_arrivals_BLUE                1
## 10 03-Jun-12       Sun         no_of_arrivals_GREEN              92
## 11 03-Jun-12       Sun         no_of_arrivals_ORANGE             19
## 12 03-Jun-12       Sun         no_of_arrivals_YELLOW             86
## 13 04-Jun-12       Mon         no_of_arrivals_BLUE                0
## 14 04-Jun-12       Mon         no_of_arrivals_GREEN             102
## 15 04-Jun-12       Mon         no_of_arrivals_ORANGE             19
## 16 04-Jun-12       Mon         no_of_arrivals_YELLOW             87
## 17 05-Jun-12       Tues        no_of_arrivals_BLUE                1
## 18 05-Jun-12       Tues        no_of_arrivals_GREEN             100
## 19 05-Jun-12       Tues        no_of_arrivals_ORANGE             19
## 20 05-Jun-12       Tues        no_of_arrivals_YELLOW             83


Add the names_prefix argument again to shorten the triage colour


ae_summary_wide %>%
  pivot_longer(cols = starts_with("no_of_arrivals_"), 
                names_to = "triage_colour",
               values_to = "no_of_arrivals",
               names_prefix = "no_of_arrivals_") # drops the prefix 
## # A tibble: 20 x 4
##    arrival_date_dt Arrival.Day triage_colour no_of_arrivals
##    <chr>           <chr>       <chr>                  <int>
##  1 01-Jun-12       Fri         BLUE                       2
##  2 01-Jun-12       Fri         GREEN                    104
##  3 01-Jun-12       Fri         ORANGE                    20
##  4 01-Jun-12       Fri         YELLOW                    84
##  5 02-Jun-12       Sat         BLUE                       1
##  6 02-Jun-12       Sat         GREEN                    109
##  7 02-Jun-12       Sat         ORANGE                    21
##  8 02-Jun-12       Sat         YELLOW                    96
##  9 03-Jun-12       Sun         BLUE                       1
## 10 03-Jun-12       Sun         GREEN                     92
## 11 03-Jun-12       Sun         ORANGE                    19
## 12 03-Jun-12       Sun         YELLOW                    86
## 13 04-Jun-12       Mon         BLUE                       0
## 14 04-Jun-12       Mon         GREEN                    102
## 15 04-Jun-12       Mon         ORANGE                    19
## 16 04-Jun-12       Mon         YELLOW                    87
## 17 05-Jun-12       Tues        BLUE                       1
## 18 05-Jun-12       Tues        GREEN                    100
## 19 05-Jun-12       Tues        ORANGE                    19
## 20 05-Jun-12       Tues        YELLOW                    83



Creating lagged variables with dplyr


First, I'll summarise the data by arrival date and hour then create a lagged variable for attendances in the previous hour


ae_summary_hr <- ae %>%
  group_by(Arrival.Month, Arrival.Day, arrival_date_dt, Arrival.Hour) %>%
  summarise(no_of_arrivals = n()) %>%
  arrange(arrival_date_dt) %>%
  ungroup()

head(ae_summary_hr)
## # A tibble: 6 x 5
##   Arrival.Month Arrival.Day arrival_date_dt Arrival.Hour no_of_arrivals
##   <chr>         <chr>       <chr>                  <int>          <int>
## 1 Jun-12        Fri         01-Jun-12                  0              4
## 2 Jun-12        Fri         01-Jun-12                  1              5
## 3 Jun-12        Fri         01-Jun-12                  2              4
## 4 Jun-12        Fri         01-Jun-12                  3              1
## 5 Jun-12        Fri         01-Jun-12                  4              2
## 6 Jun-12        Fri         01-Jun-12                  6              2


Then use the lag() function to create a new variable of patient arrivals in the previous hour


ae_summary_hr <- ae_summary_hr %>%
  mutate(prev_hr_arrivals = lag(no_of_arrivals))

head(ae_summary_hr[, 2:6], n = 10)
## # A tibble: 10 x 5
##    Arrival.Day arrival_date_dt Arrival.Hour no_of_arrivals prev_hr_arrivals
##    <chr>       <chr>                  <int>          <int>            <int>
##  1 Fri         01-Jun-12                  0              4               NA
##  2 Fri         01-Jun-12                  1              5                4
##  3 Fri         01-Jun-12                  2              4                5
##  4 Fri         01-Jun-12                  3              1                4
##  5 Fri         01-Jun-12                  4              2                1
##  6 Fri         01-Jun-12                  6              2                2
##  7 Fri         01-Jun-12                  7              5                2
##  8 Fri         01-Jun-12                  8              8                5
##  9 Fri         01-Jun-12                  9             10                8
## 10 Fri         01-Jun-12                 10             11               10


If you don't want the lag to be continuous you can use group_by() (ie if you didn't want it to rollover into the next day)


ae_summary_hr <- ae_summary_hr %>%
  group_by(arrival_date_dt) %>%
  mutate(prev_hr_arrivals_grp = lag(no_of_arrivals)) %>%
  ungroup()

head(ae_summary_hr[, 3:7], n = 10)
## # A tibble: 10 x 5
##    arrival_date_dt Arrival.Hour no_of_arrivals prev_hr_arrivals prev_hr_arrival…
##    <chr>                  <int>          <int>            <int>            <int>
##  1 01-Jun-12                  0              4               NA               NA
##  2 01-Jun-12                  1              5                4                4
##  3 01-Jun-12                  2              4                5                5
##  4 01-Jun-12                  3              1                4                4
##  5 01-Jun-12                  4              2                1                1
##  6 01-Jun-12                  6              2                2                2
##  7 01-Jun-12                  7              5                2                2
##  8 01-Jun-12                  8              8                5                5
##  9 01-Jun-12                  9             10                8                8
## 10 01-Jun-12                 10             11               10               10


If you want to lag by more than 1 hour then use the n = argument and the default = argument replaces any na's with a specified value


ae_summary_hr <- ae_summary_hr %>%
  mutate(prev_2_hr_arrivals = lag(no_of_arrivals, n = 2, default = 0))

head(ae_summary_hr[, 4:8], n = 10)
## # A tibble: 10 x 5
##    Arrival.Hour no_of_arrivals prev_hr_arrivals prev_hr_arrival…
##           <int>          <int>            <int>            <int>
##  1            0              4               NA               NA
##  2            1              5                4                4
##  3            2              4                5                5
##  4            3              1                4                4
##  5            4              2                1                1
##  6            6              2                2                2
##  7            7              5                2                2
##  8            8              8                5                5
##  9            9             10                8                8
## 10           10             11               10               10
## # … with 1 more variable: prev_2_hr_arrivals <dbl>


To do the opposite use the lead() function, which has the same arguments.