1 回答

TA貢獻1765條經驗 獲得超5個贊
# Collapse consecutive rows of `df` that satisfy `cond` into segments, compute
# start / end / duration per segment, and label each distinct
# Subject / Recipient / Length combination with a letter.
df %>%
  # The original data was loaded as factors, which have their uses, but
  # converting those to characters will be simpler to work with here.
  mutate_if(is.factor, as.character) %>%
  # Replace NA in Subject & Recipient with an empty string, and trim excess
  # spaces from the start and end. One of the recipients is " ", which is
  # treated as functionally the same as blank.
  mutate_at(
    c("Subject", "Recipient"),
    ~ if_else(is.na(.), "", stringr::str_trim(.))
  ) %>%
  filter(Subject != '') %>%
  mutate(Date = as.POSIXct(Date, format = '%m/%d/%Y %H:%M:%OS')) %>%
  mutate(cond = Edit & Folder %in% c('out', 'draft') & Message == '') %>%
  # Every row that does NOT satisfy cond bumps the counter, so each unbroken
  # run of cond rows shares one segment id.
  mutate(segment = cumsum(!cond)) %>%
  # Keep only the rows that satisfy cond; the rows that bumped the segment
  # counter are dropped here.
  filter(cond) %>%
  # Get summary stats for each segment
  group_by(Subject, Recipient, Length, segment) %>%
  summarize(Start = min(Date),
            End = max(Date),
            Duration = End - Start) %>%
  # Drop the remaining grouping before comparing neighbours: lag() must look
  # at the previous row of the whole table, not the previous row within a
  # Subject / Recipient / Length group.
  ungroup() %>%
  # Flag a row as starting a new group when it is the first row or when ANY of
  # the key columns differs from the previous row. The first row is marked
  # explicitly (TRUE | NA is TRUE), so no sentinel `default` is needed for
  # lag(); that also avoids comparing the integer Length with a string and
  # mis-flagging a blank Recipient. Look at ?lag for more details.
  mutate(new_group = as.integer(
    row_number() == 1L |
      Subject != lag(Subject) |
      Recipient != lag(Recipient) |
      Length != lag(Length)
  )) %>%
  # NOTE: LETTERS has 26 entries, so a 27th group and beyond would get NA.
  mutate(group = LETTERS[cumsum(new_group)])
# A tibble: 3 x 9
Subject Recipient Length segment Start End Duration new_group group
<chr> <chr> <int> <int> <dttm> <dttm> <drtn> <int> <chr>
1 hey [email protected],[email protected] 80 0 2020-01-02 01:00:10 2020-01-02 01:00:30 20 secs 1 A
2 hey [email protected],[email protected] 80 3 2020-01-02 01:02:00 2020-01-02 01:02:05 5 secs 0 A
3 hey [email protected],[email protected] 80 7 2020-01-02 01:03:00 2020-01-02 01:03:20 20 secs 0 A
添加回答
舉報