library(dplyr)
library(ggplot2)
library(prophet)
library(lubridate)
# Read the shopping-centre visitor file, give the visitor-count column a short
# name and convert `Dia` to a Date with lubridate's `ymd()`.
shopping <- read.csv("local-shopping_prophet.csv") %>%
  rename(visitantes = Total.de.Visitantes) %>%
  mutate(Dia = ymd(Dia))

# Print a compact overview of the columns and their types.
glimpse(shopping)
Observations: 228
Variables: 3
$ Dia <date> 2017-09-14, 2017-09-15, 2017-09-16, 2017-09-17, 2017-09-18, 2017-09-19, 2017-09-20, 2017-09-21, 2017-09-22, 2017-09-23, 201...
$ visitantes <int> 589, 696, 1034, 940, 540, 526, 312, 553, 639, 793, 745, 8, 333, 568, 352, 393, 497, 578, 563, 520, 413, 8, 350, 765, 1071, 5...
$ Clima <fct> Parcialmente Nublado, Soleado, Soleado, Parcialmente Nublado, Soleado, Soleado, Soleado, Nublado, Soleado, Soleado, Parcialm...
# Visitors per day as a plain line.
ggplot(shopping, aes(x = Dia, y = visitantes)) +
  geom_line() +
  theme_bw() +
  labs(title = "Visitantes por dia")

# Same line with every observation marked as a green point.
ggplot(shopping, aes(x = Dia, y = visitantes)) +
  geom_line() +
  geom_point(color = "forestgreen") +
  theme_bw() +
  labs(title = "Visitantes por dia")

# Points only, with ggplot2's default smoother drawn on top.
ggplot(shopping, aes(x = Dia, y = visitantes)) +
  geom_point(color = "forestgreen") +
  geom_smooth() +
  theme_bw() +
  labs(title = "Visitantes por dia: Suavizado")
# LOESS fit of visitors against the date (converted to a number) using a very
# small span. The result is stored under the name `loess`, which the next
# statement passes to predict().
loess <- stats::loess(
  visitantes ~ as.numeric(Dia),
  data = shopping,
  na.action = "na.exclude",
  model = TRUE,
  span = 0.01
)
span too small. fewer data values than degrees of freedom.pseudoinverse used at 17422neighborhood radius 2.135reciprocal condition number 0There are other near singularities as well. 4.5582k-d tree limited by memory. ncmax= 228
# Store the fitted values from the current `loess` model as a column and draw
# them as a red line over the raw points.
shopping["loess"] <- predict(loess, shopping)
loess_1 <- ggplot(shopping, aes(x = Dia, y = visitantes)) +
  geom_point() +
  geom_line(aes(y = loess), color = "firebrick", size = 1) +
  labs(title = "LOESS span:0.01") +
  theme_bw()
# Repeat the fit / predict / plot cycle for three wider spans. Each pass
# overwrites both the `loess` model and the `loess` column of `shopping`, so
# every plot is built right after its own predictions are stored.

# Span 0.33
loess <- stats::loess(
  visitantes ~ as.numeric(Dia),
  data = shopping,
  na.action = "na.exclude",
  model = TRUE,
  span = 0.33
)
shopping["loess"] <- predict(loess, shopping)
loess_2 <- ggplot(shopping, aes(x = Dia, y = visitantes)) +
  geom_point() +
  geom_line(aes(y = loess), color = "forestgreen", size = 1) +
  labs(title = "LOESS span:0.33") +
  theme_bw()

# Span 0.66
loess <- stats::loess(
  visitantes ~ as.numeric(Dia),
  data = shopping,
  na.action = "na.exclude",
  model = TRUE,
  span = 0.66
)
shopping["loess"] <- predict(loess, shopping)
loess_3 <- ggplot(shopping, aes(x = Dia, y = visitantes)) +
  geom_point() +
  geom_line(aes(y = loess), color = "steelblue", size = 1) +
  labs(title = "LOESS span:0.66") +
  theme_bw()

# Span 0.99
loess <- stats::loess(
  visitantes ~ as.numeric(Dia),
  data = shopping,
  na.action = "na.exclude",
  model = TRUE,
  span = 0.99
)
shopping["loess"] <- predict(loess, shopping)
loess_4 <- ggplot(shopping, aes(x = Dia, y = visitantes)) +
  geom_point() +
  geom_line(aes(y = loess), color = "purple", size = 1) +
  labs(title = "LOESS span:0.99") +
  theme_bw()

# Show the four smoothing levels together in one grid.
cowplot::plot_grid(loess_1, loess_2, loess_3, loess_4)
Prophet requiere que le pasemos el dataset con:
ds: la variable temporal
y: la variable a predecir
# Business rule: days with fewer than 250 visitors are treated as missing.
# The count is set to NA; the row itself (and its date) is kept.
shopping[shopping[["visitantes"]] < 250, "visitantes"] <- NA

# Build the two-column input for Prophet: `ds` (date) and `y` (target).
prophet_df <- shopping %>%
  select(ds = Dia, y = visitantes)
Hay muchos parametros para tener en cuenta. Veremos algunos:
df: dataframe
growth: tipo de tendencia: lineal o logistica
yearly.seasonality: hay estacionalidad anual?
daily.seasonality: hay estacionalidad diaria?
holidays: dataframe con fechas de vacaciones/eventos especiales
La funcion prophet
crea el modelo, podemos pasarle o no el dataframe.
La funcion fit.prophet
aplica un modelo creado a un dataframe
# Create the model object only, without passing any data yet
prophet_base <- prophet()
# Fit that model to the ds/y data frame
prophet_base <- fit.prophet(prophet_base, df = prophet_df)
Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
Initial log joint probability = -4.68957
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Fijense que el modelo automaticamente deshabilita la estacionalidad anual y diaria.
Llamando a plot
obtenemos el valor predicho del modelo y el valor original.
Es dificil de notar pero el modelo realiza predicciones aun para los dias en los cuales no hay datos.
# Plot the base model's in-sample predictions together with the observed data.
prophet_base %>%
  plot(fcst = predict(prophet_base, prophet_df)) +
  theme_bw()
La funcion prophet_plot_components
nos devuelve los efectos de los componentes en nuestra variable a predecir
# Draw the component panels of the base model.
# The original appended `+ theme_bw()` to this call, and that expression
# evaluated to `NULL` (printed as a stray `NULL`) instead of a themed plot.
# Per the prophet documentation the function draws the panels itself and
# returns them as a list, so there is no single ggplot object to add a theme to.
prophet_plot_components(
  prophet_base,
  fcst = predict(prophet_base, prophet_df)
)
La funcion cross_validation
permite realizar pronosticos realizando un esquema de cross-validation temporal y, a partir de ellos, obtener ciertas metricas de performance.
horizon: horizonte del pronostico
period: periodo entre fechas de analisis
initial: periodo inicial de entrenamiento
# Time-series cross-validation of the base model: 45 days of initial training,
# a new cutoff every 7 days, and a 15-day forecast horizon.
cv_base <- cross_validation(
  prophet_base,
  horizon = 15,
  units = "days",
  period = 7,
  initial = 45
)
Making 24 forecasts with cutoffs between 2017-11-04 and 2018-04-14
Initial log joint probability = -3.06265
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -2.89179
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -3.42994
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -2.99569
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -3.58187
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -3.40797
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -6.14203
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -6.67306
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -3.38137
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -3.21596
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -3.43862
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -3.36873
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -3.43537
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -3.5117
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -3.60968
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -3.69348
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -3.77909
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -3.8066
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -4.02179
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -4.02246
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -3.99051
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -4.04036
Optimization terminated normally:
Convergence detected: relative gradient magnitude is below tolerance
Initial log joint probability = -4.27091
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Initial log joint probability = -4.33551
Optimization terminated normally:
Convergence detected: relative gradient magnitude is below tolerance
# Display the cross-validation result stored in `cv_base`.
cv_base
La funcion performance_metrics
computa varias metricas de performance a partir de un dataframe de cross validation de prophet
# Performance metrics for the base model's cross-validation forecasts, using a
# rolling window of 0.5.
cv_base %>%
  performance_metrics(rolling_window = 0.5)
La funcion add_seasonality
nos permite agregar nuevas estacionalidades. Definimos:
m: modelo
name: nombre de la estacionalidad
period: cantidad de dias del periodo
fourier.order: orden de la serie de fourier para modelar la estacionalidad
# Create the model object only
prophet_mensual <- prophet()
# Register a monthly seasonality (30.5-day period, Fourier order 4)
prophet_mensual <- add_seasonality(
  prophet_mensual,
  name = "monthly",
  period = 30.5,
  fourier.order = 4
)
# Fit the model to the ds/y data frame
prophet_mensual <- fit.prophet(prophet_mensual, df = prophet_df)
Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
Initial log joint probability = -4.68957
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Fijense que el modelo automaticamente deshabilita la estacionalidad anual y diaria.
# Plot the monthly-seasonality model's in-sample predictions with the data.
prophet_mensual %>%
  plot(fcst = predict(prophet_mensual, prophet_df)) +
  theme_bw()
La funcion prophet_plot_components
nos devuelve los efectos de los componentes en nuestra variable a predecir
# Draw the component panels of the monthly-seasonality model.
# The original appended `+ theme_bw()` to this call, and that expression
# evaluated to `NULL` (printed as a stray `NULL`) instead of a themed plot.
# Per the prophet documentation the function draws the panels itself and
# returns them as a list, so there is no single ggplot object to add a theme to.
prophet_plot_components(
  prophet_mensual,
  fcst = predict(prophet_mensual, prophet_df)
)
La funcion performance_metrics
computa varias metricas de performance a partir de un dataframe de cross validation de prophet
# `cv_mensual` is not created anywhere in the visible script, so calling
# performance_metrics() on it fails with "object not found". Cross-validate
# the monthly-seasonality model first, with the same initial / period /
# horizon settings used for the base model so the metrics are comparable.
cv_mensual <- cross_validation(
  prophet_mensual,
  initial = 45,
  period = 7,
  horizon = 15,
  units = "days"
)
performance_metrics(cv_mensual, rolling_window = 0.5)
Como ultimo paso vamos a agregar las ventas de navidad y ciertos dias de promociones como eventos especiales
Creamos el dataframe de eventos con: nombre del evento, fechas y una “ventana” para definir si el evento se estira a ciertos dias.
# Special-event calendar for Prophet: one row per event date, with both
# windows set to 0.
#
# `stringsAsFactors = FALSE` keeps `holiday` as a character column on every R
# version. On R < 4.0 it defaulted to a factor, and bind_rows() then had to
# coerce the two different factors to character, emitting warnings. The
# combined result is the same character column either way.

# Pre-Christmas shopping days.
christmas <- data.frame(
  holiday = "christmas",
  ds = ymd(c(
    "2017-12-16", "2017-12-17", "2017-12-18",
    "2017-12-19", "2017-12-20", "2017-12-21",
    "2017-12-22", "2017-12-23"
  )),
  lower_window = 0,
  upper_window = 0,
  stringsAsFactors = FALSE
)

# Promotion days.
big_sales <- data.frame(
  holiday = "big_sales",
  ds = ymd(c(
    "2017-09-16", "2017-10-08", "2017-10-14",
    "2017-11-20", "2017-12-03", "2017-12-30"
  )),
  lower_window = 0,
  upper_window = 0,
  stringsAsFactors = FALSE
)

# Single events table, Christmas rows first.
holidays <- bind_rows(christmas, big_sales)
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
# Compact overview of the combined events table.
holidays %>%
  glimpse()
Observations: 14
Variables: 4
$ holiday <chr> "christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "big_sales", "big_...
$ ds <date> 2017-12-16, 2017-12-17, 2017-12-18, 2017-12-19, 2017-12-20, 2017-12-21, 2017-12-22, 2017-12-23, 2017-09-16, 2017-10-08, 2...
$ lower_window <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ upper_window <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
# Create the model object, this time with the special-events table
prophet_full <- prophet(holidays = holidays)
# Register a monthly seasonality (30.5-day period, Fourier order 4)
prophet_full <- add_seasonality(
  prophet_full,
  name = "monthly",
  period = 30.5,
  fourier.order = 4
)
# Fit the model to the ds/y data frame
prophet_full <- fit.prophet(prophet_full, df = prophet_df)
Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
Initial log joint probability = -4.68957
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
Fijense que el modelo automaticamente deshabilita la estacionalidad anual y diaria.
# Plot the full model's in-sample predictions together with the observed data.
prophet_full %>%
  plot(fcst = predict(prophet_full, prophet_df)) +
  theme_bw()
La funcion prophet_plot_components
nos devuelve los efectos de los componentes en nuestra variable a predecir
# Draw the component panels of the full model (events + monthly seasonality).
# The original appended `+ theme_bw()` to this call, and that expression
# evaluated to `NULL` (printed as a stray `NULL`) instead of a themed plot.
# Per the prophet documentation the function draws the panels itself and
# returns them as a list, so there is no single ggplot object to add a theme to.
prophet_plot_components(
  prophet_full,
  fcst = predict(prophet_full, prophet_df)
)
# `cv_full` is not created anywhere in the visible script, so calling
# performance_metrics() on it fails with "object not found". Cross-validate
# the full model first, with the same initial / period / horizon settings used
# for the base model so the metrics are comparable.
cv_full <- cross_validation(
  prophet_full,
  initial = 45,
  period = 7,
  horizon = 15,
  units = "days"
)
performance_metrics(cv_full, rolling_window = 0.5)
# dyplot.prophet() view of the full model's in-sample predictions.
prophet_full %>%
  dyplot.prophet(fcst = predict(prophet_full, prophet_df))