tidyr
and dplyr
library(tidyverse)
ae <- read.csv("ae_data_2.csv")
head(ae)
## Arrival.Month Arrival.Day Arrival.Hour arrival_date_dt Mins.on.AE Over.4.hrs
## 1 Jun-12 Fri 20 01-Jun-12 108 0
## 2 Jun-12 Fri 12 01-Jun-12 49 0
## 3 Jun-12 Fri 18 01-Jun-12 167 0
## 4 Jun-12 Fri 13 01-Jun-12 140 0
## 5 Jun-12 Fri 12 01-Jun-12 233 0
## 6 Jun-12 Fri 23 01-Jun-12 233 0
## Triage_Main_Code Triage_Colour
## 1 4 GREEN
## 2 4 GREEN
## 3 4 GREEN
## 4 3 YELLOW
## 5 3 YELLOW
## 6 4 GREEN
ae_summary <- ae %>%
group_by(arrival_date_dt, Arrival.Day, Triage_Colour) %>%
summarise(no_of_arrivals = n()) %>%
arrange(arrival_date_dt) %>%
ungroup()
head(ae_summary)
## # A tibble: 6 x 4
## arrival_date_dt Arrival.Day Triage_Colour no_of_arrivals
## <chr> <chr> <chr> <int>
## 1 01-Jun-12 Fri BLUE 2
## 2 01-Jun-12 Fri GREEN 104
## 3 01-Jun-12 Fri ORANGE 20
## 4 01-Jun-12 Fri YELLOW 84
## 5 02-Jun-12 Sat BLUE 1
## 6 02-Jun-12 Sat GREEN 109
tidyr
The tidyr
package has the pivot_wider()
and pivot_longer()
functions (used to be gather()
and spread()
)
To transform the data into wide format and have a separate column for each triage colour (so only one row per day) use pivot_wider()
ae_summary %>%
pivot_wider(names_from = Triage_Colour,
values_from = no_of_arrivals)
## # A tibble: 5 x 6
## arrival_date_dt Arrival.Day BLUE GREEN ORANGE YELLOW
## <chr> <chr> <int> <int> <int> <int>
## 1 01-Jun-12 Fri 2 104 20 84
## 2 02-Jun-12 Sat 1 109 21 96
## 3 03-Jun-12 Sun 1 92 19 86
## 4 04-Jun-12 Mon NA 102 19 87
## 5 05-Jun-12 Tues 1 100 19 83
The values_fill
argument will replace the na with a value
ae_summary %>%
pivot_wider(names_from = Triage_Colour,
values_from = no_of_arrivals,
values_fill = 0)
## # A tibble: 5 x 6
## arrival_date_dt Arrival.Day BLUE GREEN ORANGE YELLOW
## <chr> <chr> <int> <int> <int> <int>
## 1 01-Jun-12 Fri 2 104 20 84
## 2 02-Jun-12 Sat 1 109 21 96
## 3 03-Jun-12 Sun 1 92 19 86
## 4 04-Jun-12 Mon 0 102 19 87
## 5 05-Jun-12 Tues 1 100 19 83
You can also add a prefix the name of the new columns using the names_prefix
argument
ae_summary_wide <- ae_summary %>%
pivot_wider(names_from = Triage_Colour,
values_from = no_of_arrivals,
values_fill = 0,
names_prefix = "no_of_arrivals_") # adds prefix to column names
## # A tibble: 5 x 6
## arrival_date_dt Arrival.Day no_of_arrivals_… no_of_arrivals_… no_of_arrivals_…
## <chr> <chr> <int> <int> <int>
## 1 01-Jun-12 Fri 2 104 20
## 2 02-Jun-12 Sat 1 109 21
## 3 03-Jun-12 Sun 1 92 19
## 4 04-Jun-12 Mon 0 102 19
## 5 05-Jun-12 Tues 1 100 19
## # … with 1 more variable: no_of_arrivals_YELLOW <int>
To change the data back into long format use pivot_longer()
ae_summary_wide %>%
pivot_longer(cols = starts_with("no_of_arrivals_"), # can also select columns by name or index
names_to = "triage_colour",
values_to = "no_of_arrivals")
## # A tibble: 20 x 4
## arrival_date_dt Arrival.Day triage_colour no_of_arrivals
## <chr> <chr> <chr> <int>
## 1 01-Jun-12 Fri no_of_arrivals_BLUE 2
## 2 01-Jun-12 Fri no_of_arrivals_GREEN 104
## 3 01-Jun-12 Fri no_of_arrivals_ORANGE 20
## 4 01-Jun-12 Fri no_of_arrivals_YELLOW 84
## 5 02-Jun-12 Sat no_of_arrivals_BLUE 1
## 6 02-Jun-12 Sat no_of_arrivals_GREEN 109
## 7 02-Jun-12 Sat no_of_arrivals_ORANGE 21
## 8 02-Jun-12 Sat no_of_arrivals_YELLOW 96
## 9 03-Jun-12 Sun no_of_arrivals_BLUE 1
## 10 03-Jun-12 Sun no_of_arrivals_GREEN 92
## 11 03-Jun-12 Sun no_of_arrivals_ORANGE 19
## 12 03-Jun-12 Sun no_of_arrivals_YELLOW 86
## 13 04-Jun-12 Mon no_of_arrivals_BLUE 0
## 14 04-Jun-12 Mon no_of_arrivals_GREEN 102
## 15 04-Jun-12 Mon no_of_arrivals_ORANGE 19
## 16 04-Jun-12 Mon no_of_arrivals_YELLOW 87
## 17 05-Jun-12 Tues no_of_arrivals_BLUE 1
## 18 05-Jun-12 Tues no_of_arrivals_GREEN 100
## 19 05-Jun-12 Tues no_of_arrivals_ORANGE 19
## 20 05-Jun-12 Tues no_of_arrivals_YELLOW 83
Add the names_prefix argument again to shorten the triage colour
ae_summary_wide %>%
pivot_longer(cols = starts_with("no_of_arrivals_"),
names_to = "triage_colour",
values_to = "no_of_arrivals",
names_prefix = "no_of_arrivals_") # drops the prefix
## # A tibble: 20 x 4
## arrival_date_dt Arrival.Day triage_colour no_of_arrivals
## <chr> <chr> <chr> <int>
## 1 01-Jun-12 Fri BLUE 2
## 2 01-Jun-12 Fri GREEN 104
## 3 01-Jun-12 Fri ORANGE 20
## 4 01-Jun-12 Fri YELLOW 84
## 5 02-Jun-12 Sat BLUE 1
## 6 02-Jun-12 Sat GREEN 109
## 7 02-Jun-12 Sat ORANGE 21
## 8 02-Jun-12 Sat YELLOW 96
## 9 03-Jun-12 Sun BLUE 1
## 10 03-Jun-12 Sun GREEN 92
## 11 03-Jun-12 Sun ORANGE 19
## 12 03-Jun-12 Sun YELLOW 86
## 13 04-Jun-12 Mon BLUE 0
## 14 04-Jun-12 Mon GREEN 102
## 15 04-Jun-12 Mon ORANGE 19
## 16 04-Jun-12 Mon YELLOW 87
## 17 05-Jun-12 Tues BLUE 1
## 18 05-Jun-12 Tues GREEN 100
## 19 05-Jun-12 Tues ORANGE 19
## 20 05-Jun-12 Tues YELLOW 83
dplyr
First, I'll summarise the data by arrival date and hour then create a lagged variable for attendances in the previous hour
ae_summary_hr <- ae %>%
group_by(Arrival.Month, Arrival.Day, arrival_date_dt, Arrival.Hour) %>%
summarise(no_of_arrivals = n()) %>%
arrange(arrival_date_dt) %>%
ungroup()
head(ae_summary_hr)
## # A tibble: 6 x 5
## Arrival.Month Arrival.Day arrival_date_dt Arrival.Hour no_of_arrivals
## <chr> <chr> <chr> <int> <int>
## 1 Jun-12 Fri 01-Jun-12 0 4
## 2 Jun-12 Fri 01-Jun-12 1 5
## 3 Jun-12 Fri 01-Jun-12 2 4
## 4 Jun-12 Fri 01-Jun-12 3 1
## 5 Jun-12 Fri 01-Jun-12 4 2
## 6 Jun-12 Fri 01-Jun-12 6 2
Then use the lag()
function to create a new variable of patient arrivals in the previous hour
ae_summary_hr <- ae_summary_hr %>%
mutate(prev_hr_arrivals = lag(no_of_arrivals))
head(ae_summary_hr[, 2:6], n = 10)
## # A tibble: 10 x 5
## Arrival.Day arrival_date_dt Arrival.Hour no_of_arrivals prev_hr_arrivals
## <chr> <chr> <int> <int> <int>
## 1 Fri 01-Jun-12 0 4 NA
## 2 Fri 01-Jun-12 1 5 4
## 3 Fri 01-Jun-12 2 4 5
## 4 Fri 01-Jun-12 3 1 4
## 5 Fri 01-Jun-12 4 2 1
## 6 Fri 01-Jun-12 6 2 2
## 7 Fri 01-Jun-12 7 5 2
## 8 Fri 01-Jun-12 8 8 5
## 9 Fri 01-Jun-12 9 10 8
## 10 Fri 01-Jun-12 10 11 10
If you don't want the lag to be continuous you can use group_by()
(ie if you didn't want it to rollover into the next day)
ae_summary_hr <- ae_summary_hr %>%
group_by(arrival_date_dt) %>%
mutate(prev_hr_arrivals_grp = lag(no_of_arrivals)) %>%
ungroup()
head(ae_summary_hr[, 3:7], n = 10)
## # A tibble: 10 x 5
## arrival_date_dt Arrival.Hour no_of_arrivals prev_hr_arrivals prev_hr_arrival…
## <chr> <int> <int> <int> <int>
## 1 01-Jun-12 0 4 NA NA
## 2 01-Jun-12 1 5 4 4
## 3 01-Jun-12 2 4 5 5
## 4 01-Jun-12 3 1 4 4
## 5 01-Jun-12 4 2 1 1
## 6 01-Jun-12 6 2 2 2
## 7 01-Jun-12 7 5 2 2
## 8 01-Jun-12 8 8 5 5
## 9 01-Jun-12 9 10 8 8
## 10 01-Jun-12 10 11 10 10
If you want to lag by more than 1 hour then use the n =
argument and the default =
argument replaces any na's with a specified value
ae_summary_hr <- ae_summary_hr %>%
mutate(prev_2_hr_arrivals = lag(no_of_arrivals, n = 2, default = 0))
head(ae_summary_hr[, 4:8], n = 10)
## # A tibble: 10 x 5
## Arrival.Hour no_of_arrivals prev_hr_arrivals prev_hr_arrival…
## <int> <int> <int> <int>
## 1 0 4 NA NA
## 2 1 5 4 4
## 3 2 4 5 5
## 4 3 1 4 4
## 5 4 2 1 1
## 6 6 2 2 2
## 7 7 5 2 2
## 8 8 8 5 5
## 9 9 10 8 8
## 10 10 11 10 10
## # … with 1 more variable: prev_2_hr_arrivals <dbl>
To do the opposite use the lead()
function, which has the same arguments.