When our events come with a datetime, we can derive several features from it for machine learning models. For example, we can generate the:
- Year
- Month
- Weekday
- Hour
- Minute
- Week of the Year
- Quarter
Let’s see how we can generate these features in R from a datetime object. For machine learning purposes, I would suggest converting some features, such as weekdays, months and hours, to factors. Even better, we can create additional features like:
- A boolean `isWeekend`, taking 1 for weekends and 0 otherwise
- Period of Day like Morning, Afternoon, Evening
library(tidyverse)

# Reproducible sample: 10 random timestamps drawn uniformly (as epoch seconds)
# between late June 2018 and late December 2019.
set.seed(5)
df <- tibble(
  my_date = lubridate::as_datetime(runif(10, 1530000000, 1577739600))
)

# Derive calendar features from the datetime column. Categorical features are
# stored as factors so that models treat them as categories, not as numbers.
df %>%
  mutate(
    Year = format(my_date, "%Y"),
    Month_Number = as.factor(format(my_date, "%m")),
    # weekdays() returns day names in the current locale.
    Weekday = as.factor(weekdays(my_date)),
    Hour = as.factor(format(my_date, "%H")),
    Minute = as.factor(format(my_date, "%M")),
    # %W: week of the year (00-53), with Monday as the first day of the week.
    Week = format(my_date, "%W"),
    # Plain quarter (1-4) as a factor. The previous `with_year = T` returned a
    # numeric year.quarter (e.g. 2018.4) and relied on the reassignable `T`.
    Quarter = as.factor(lubridate::quarter(my_date))
  )
Output:
# A tibble: 10 x 8
my_date Year Month_Number Weekday Hour Minute Week Quarter
<dttm> <chr> <fct> <fct> <fct> <fct> <chr> <fct>
1 2018-10-14 23:02:37 2018 10 Sunday 23 02 41 4
2 2019-07-09 22:41:01 2019 07 Tuesday 22 41 27 3
3 2019-11-14 22:41:22 2019 11 Thursday 22 41 45 4
4 2018-11-30 11:25:16 2018 11 Friday 11 25 48 4
5 2018-08-23 03:45:55 2018 08 Thursday 03 45 34 3
6 2019-07-18 16:43:22 2019 07 Thursday 16 43 28 3
7 2019-04-14 01:16:38 2019 04 Sunday 01 16 14 2
8 2019-09-15 18:01:43 2019 09 Sunday 18 01 36 3
9 2019-12-06 20:08:53 2019 12 Friday 20 08 48 4
10 2018-08-26 08:43:02 2018 08 Sunday 08 43 34 3
Extra Part: How to Round Dates in R
Sometimes in our analysis we want to round dates to the year, the month, or even the hour or minute. Let’s see how we can do this on our data by taking the floor and the ceiling by month:
library(tidyverse)

# Same reproducible sample of 10 random timestamps as before.
set.seed(5)
df <- runif(10, 1530000000, 1577739600) %>%
  lubridate::as_datetime() %>%
  tibble(my_date = .)

# Floor: first instant of the month containing my_date.
# Ceiling: first instant of the following month.
mutate(
  df,
  Floor = lubridate::floor_date(my_date, "month"),
  Ceiling = lubridate::ceiling_date(my_date, "month")
)
# A tibble: 10 x 3
my_date Floor Ceiling
<dttm> <dttm> <dttm>
1 2018-10-14 23:02:37 2018-10-01 00:00:00 2018-11-01 00:00:00
2 2019-07-09 22:41:01 2019-07-01 00:00:00 2019-08-01 00:00:00
3 2019-11-14 22:41:22 2019-11-01 00:00:00 2019-12-01 00:00:00
4 2018-11-30 11:25:16 2018-11-01 00:00:00 2018-12-01 00:00:00
5 2018-08-23 03:45:55 2018-08-01 00:00:00 2018-09-01 00:00:00
6 2019-07-18 16:43:22 2019-07-01 00:00:00 2019-08-01 00:00:00
7 2019-04-14 01:16:38 2019-04-01 00:00:00 2019-05-01 00:00:00
8 2019-09-15 18:01:43 2019-09-01 00:00:00 2019-10-01 00:00:00
9 2019-12-06 20:08:53 2019-12-01 00:00:00 2020-01-01 00:00:00
10 2018-08-26 08:43:02 2018-08-01 00:00:00 2018-09-01 00:00:00
You can find more information in the lubridate documentation. The following examples are taken from it:
# Examples from the lubridate documentation for round_date(), floor_date()
# and ceiling_date(). Lines starting with `#>` show the printed result of the
# statement directly above them.
library(lubridate)

## print fractional seconds
options(digits.secs = 6)

x <- ymd_hms("2009-08-03 12:01:59.23")
round_date(x, ".5s")
#> [1] "2009-08-03 12:01:59 UTC"
round_date(x, "sec")
#> [1] "2009-08-03 12:01:59 UTC"
round_date(x, "second")
#> [1] "2009-08-03 12:01:59 UTC"
round_date(x, "minute")
#> [1] "2009-08-03 12:02:00 UTC"
round_date(x, "5 mins")
#> [1] "2009-08-03 12:00:00 UTC"
round_date(x, "hour")
#> [1] "2009-08-03 12:00:00 UTC"
round_date(x, "2 hours")
#> [1] "2009-08-03 12:00:00 UTC"
round_date(x, "day")
#> [1] "2009-08-04 UTC"
round_date(x, "week")
#> [1] "2009-08-02 UTC"
round_date(x, "month")
#> [1] "2009-08-01 UTC"
round_date(x, "bimonth")
#> [1] "2009-09-01 UTC"
round_date(x, "quarter") == round_date(x, "3 months")
#> [1] TRUE
round_date(x, "halfyear")
#> [1] "2009-07-01 UTC"
round_date(x, "year")
#> [1] "2010-01-01 UTC"

x <- ymd_hms("2009-08-03 12:01:59.23")
floor_date(x, ".1s")
#> [1] "2009-08-03 12:01:59.2 UTC"
floor_date(x, "second")
#> [1] "2009-08-03 12:01:59 UTC"
floor_date(x, "minute")
#> [1] "2009-08-03 12:01:00 UTC"
floor_date(x, "hour")
#> [1] "2009-08-03 12:00:00 UTC"
floor_date(x, "day")
#> [1] "2009-08-03 UTC"
floor_date(x, "week")
#> [1] "2009-08-02 UTC"
floor_date(x, "month")
#> [1] "2009-08-01 UTC"
floor_date(x, "bimonth")
#> [1] "2009-07-01 UTC"
floor_date(x, "quarter")
#> [1] "2009-07-01 UTC"
floor_date(x, "season")
#> [1] "2009-06-01 UTC"
floor_date(x, "halfyear")
#> [1] "2009-07-01 UTC"
floor_date(x, "year")
#> [1] "2009-01-01 UTC"

x <- ymd_hms("2009-08-03 12:01:59.23")
ceiling_date(x, ".1 sec") # imprecise representation at 0.1 sec !!!
#> [1] "2009-08-03 12:01:59.2 UTC"
ceiling_date(x, "second")
#> [1] "2009-08-03 12:02:00 UTC"
ceiling_date(x, "minute")
#> [1] "2009-08-03 12:02:00 UTC"
ceiling_date(x, "5 mins")
#> [1] "2009-08-03 12:05:00 UTC"
ceiling_date(x, "hour")
#> [1] "2009-08-03 13:00:00 UTC"
ceiling_date(x, "day")
#> [1] "2009-08-04 UTC"
ceiling_date(x, "week")
#> [1] "2009-08-09 UTC"
ceiling_date(x, "month")
#> [1] "2009-09-01 UTC"
ceiling_date(x, "bimonth") == ceiling_date(x, "2 months")
#> [1] TRUE
ceiling_date(x, "quarter")
#> [1] "2009-10-01 UTC"
ceiling_date(x, "season")
#> [1] "2009-09-01 UTC"
ceiling_date(x, "halfyear")
#> [1] "2010-01-01 UTC"
ceiling_date(x, "year")
#> [1] "2010-01-01 UTC"

## As of R 3.4.2 POSIXct printing of fractional numbers is wrong
as.POSIXct("2009-08-03 12:01:59.3") ## -> "2009-08-03 12:01:59.2 CEST"
#> [1] "2009-08-03 12:01:59.2 UTC"
ceiling_date(x, ".1 sec") ## -> "2009-08-03 12:01:59.2 CEST"
#> [1] "2009-08-03 12:01:59.2 UTC"

## behaviour of `change_on_boundary`
## As per default behaviour `NULL`, instants on the boundary remain the
## same but dates are rounded up
ceiling_date(ymd_hms("2000-01-01 00:00:00"), "month")
#> [1] "2000-01-01 UTC"
ceiling_date(ymd("2000-01-01"), "month")
#> [1] "2000-02-01"

## If `TRUE`, both instants and dates on the boundary are rounded up
ceiling_date(ymd_hms("2000-01-01 00:00:00"), "month", change_on_boundary = TRUE)
#> [1] "2000-02-01 UTC"
ceiling_date(ymd("2000-01-01"), "month")
#> [1] "2000-02-01"

## If `FALSE`, both instants and dates on the boundary remain the same
ceiling_date(ymd_hms("2000-01-01 00:00:00"), "month", change_on_boundary = FALSE)
#> [1] "2000-01-01 UTC"
ceiling_date(ymd("2000-01-01"), "month")
#> [1] "2000-02-01"

x <- ymd_hms("2000-01-01 00:00:00")
ceiling_date(x, "month")
#> [1] "2000-01-01 UTC"
ceiling_date(x, "month", change_on_boundary = TRUE)
#> [1] "2000-02-01 UTC"

## For Date objects first day of the month is not on the
## "boundary". change_on_boundary applies to instants only.
x <- ymd("2000-01-01")
ceiling_date(x, "month")
#> [1] "2000-02-01"
ceiling_date(x, "month", change_on_boundary = TRUE)
#> [1] "2000-02-01"