library(ggplot2)
theme_set(theme_bw(base_family = "STKaiti"))
如果时间序列数据没有通过白噪声检验,则说明该序列为随机数序列,则没有建立时间序列模型进行分析的必要。
用来判断时间序列是否为平稳序列
协整检验和Granger因果检验
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(tseries)
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## 模拟一组随机数据和非随机数据进行白噪声检验
AirPas <- read.csv("data/chap7/AirPassengers.csv",stringsAsFactors=FALSE)
AirPas$Month <- as.yearmon(AirPas$Month)
set.seed(123) ## 生成一组随机数据
AirPas$randdata <- round(rnorm(nrow(AirPas),mean = mean(AirPas$Passengers),
sd=50))
head(AirPas)
## Month Passengers randdata
## 1 1 1949 112 252
## 2 2 1949 118 269
## 3 3 1949 132 358
## 4 4 1949 129 284
## 5 5 1949 121 287
## 6 6 1949 135 366
# ## 转化为时间序列数据
# Air_ts <- ts(AirPas$Passengers,start = c(1949,1),deltat = 1/12)
# plot.ts(Air_ts)
# ## 可视化两组时间序列数据
# Air_tsdf <- tsdf(Air_ts,colname = "month")
# head(Air_tsdf)
# set.seed(123)
# Air_tsdf$randdata <- round(rnorm(nrow(AirPas),mean = mean(AirPas$Passengers),
# sd=20))
# head(Air_tsdf)
## 可视化两组数据集
AirPas%>%gather(key = "dataclass",value = "Number",-Month)%>%
ggplot(aes(x=Month,y=Number))+
geom_line(aes(colour=dataclass,linetype = dataclass))+
theme(legend.position = c(0.15,0.8))
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
## Ljung-Box 检验
Box.test(AirPas$Passengers,type ="Ljung-Box")
##
## Box-Ljung test
##
## data: AirPas$Passengers
## X-squared = 132.14, df = 1, p-value < 2.2e-16
## p-value < 2.2e-16 说明该序列为非随机数据
Box.test(AirPas$randdata,type ="Ljung-Box")
##
## Box-Ljung test
##
## data: AirPas$randdata
## X-squared = 0.13193, df = 1, p-value = 0.7164
## p-value = 0.6978 说明该序列为随机数据,即为白噪声
## 单位根检验,检验时间序列的平稳性
## ADF检验的零假设为有单位根
## 如果一个时间序列是平稳的,则通常只有随机成分,常用ARMA模型来预测未来的取值,
## 如果将一组数据经过d次差分后可以将不平稳的序列准化为平稳的序列,则称序列为d阶单整。
## 生成ARIMA(2,2,2)的时间序列数据,进行单位根检验演示
adfdata <- arima.sim(list(order = c(2,2,2),ar = c(0.8897, -0.4858),
d=2,ma = c(-0.2279, 0.2488)),n = 200)
diff1 <- diff(adfdata)
diff2 <- diff(diff1)
diff3 <- diff(diff2)
## 可视化4种曲线
par(mfrow=c(2,2),family = "STKaiti")
plot(adfdata,main="ARIMA(2,2,2)")
plot(diff1,main="差分1次")
plot(diff2,main="差分2次")
plot(diff3,main="差分3次")
## 进行单位根检验
adf.test(adfdata)
##
## Augmented Dickey-Fuller Test
##
## data: adfdata
## Dickey-Fuller = -0.38412, Lag order = 5, p-value = 0.9862
## alternative hypothesis: stationary
## p-value = 0.9684 说明数据不是平稳的
adf.test(diff1)
##
## Augmented Dickey-Fuller Test
##
## data: diff1
## Dickey-Fuller = -1.959, Lag order = 5, p-value = 0.5933
## alternative hypothesis: stationary
## p-value = 0.6465 说明一阶差分后数据不是平稳的
adf.test(diff2)
## Warning in adf.test(diff2): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: diff2
## Dickey-Fuller = -6.8816, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
Box.test(diff2,type ="Ljung-Box")
##
## Box-Ljung test
##
## data: diff2
## X-squared = 62.357, df = 1, p-value = 2.887e-15
## p-value = 0.01, 说明2阶差分后数据是平稳的,而且不是白噪声数据
## 可以对原始数据建立ARIMA(p,2,q)模型
Box.test(diff3,type ="Ljung-Box")
##
## Box-Ljung test
##
## data: diff3
## X-squared = 0.36924, df = 1, p-value = 0.5434
## 3阶差分后的数据已经是白噪声数据,没有分析的价值
如果一个时间序列数据是平稳的,那么可以使用ARMA模型来预测未来的数据
library(ggfortify)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(forecast)
## Registered S3 methods overwritten by 'forecast':
## method from
## autoplot.Arima ggfortify
## autoplot.acf ggfortify
## autoplot.ar ggfortify
## autoplot.bats ggfortify
## autoplot.decomposed.ts ggfortify
## autoplot.ets ggfortify
## autoplot.forecast ggfortify
## autoplot.stl ggfortify
## autoplot.ts ggfortify
## fitted.ar ggfortify
## fitted.fracdiff fracdiff
## fortify.ts ggfortify
## residuals.ar ggfortify
## residuals.fracdiff fracdiff
# library(TSA)
## 读取数据
ARMAdata <- read.csv("data/chap7/ARMAdata.csv")
ARMAdata <- ts(ARMAdata$x)
plot.ts(ARMAdata)
autoplot(ARMAdata)+ggtitle("序列变化趋势")
## 白噪声检验
Box.test(ARMAdata,type ="Ljung-Box")
##
## Box-Ljung test
##
## data: ARMAdata
## X-squared = 62.357, df = 1, p-value = 2.887e-15
## p-value = 4.552e-15 ,说明不是白噪声
## 平稳性检验,单位根检验
adf.test(ARMAdata)
## Warning in adf.test(ARMAdata): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: ARMAdata
## Dickey-Fuller = -6.8816, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
## p-value = 0.01,说明数据是平稳的
## 分析序列的自相关系数和偏自相关系数确定参数p和q
p1 <- autoplot(acf(ARMAdata,lag.max = 30,plot = F))+
ggtitle("序列自相关图")
p2 <- autoplot(pacf(ARMAdata,lag.max = 30,plot = F))+
ggtitle("序列偏自相关图")
gridExtra::grid.arrange(p1,p2,nrow=2)
## 偏自相关图3阶后截尾,可以认为p的取值为3左右,
## 自相关图5阶后截尾,可以认为q的取值为5左右,
## 通过观察自相关系数和偏自相关系数虽然可以确定p和q,但是这不是最好的方法,
## R提供了自动寻找序列合适的参数的函数
auto.arima(ARMAdata)
## Series: ARMAdata
## ARIMA(3,0,0) with zero mean
##
## Coefficients:
## ar1 ar2 ar3
## 0.6641 -0.1544 -0.2454
## s.e. 0.0684 0.0823 0.0685
##
## sigma^2 estimated as 0.9333: log likelihood=-275.78
## AIC=559.56 AICc=559.76 BIC=572.75
## 可以发现较好的ARMA模型为ARMA(2,1)
## 对数据建立ARMA(2,1)模型,并预测后面的数据
ARMAmod <- arima(ARMAdata,order = c(2,0,1))
summary(ARMAmod)
##
## Call:
## arima(x = ARMAdata, order = c(2, 0, 1))
##
## Coefficients:
## ar1 ar2 ma1 intercept
## 1.2059 -0.6074 -0.5455 0.1003
## s.e. 0.1030 0.0637 0.1217 0.0772
##
## sigma^2 estimated as 0.9168: log likelihood = -275.5, aic = 561.01
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.001170727 0.9574957 0.7646117 -38.47298 349.0644 0.8157659
## ACF1
## Training set -0.01911556
## 对拟合残差进行白噪声检验
Box.test(ARMAmod$residuals,type ="Ljung-Box")
##
## Box-Ljung test
##
## data: ARMAmod$residuals
## X-squared = 0.074183, df = 1, p-value = 0.7853
## p-value = 0.7853 ,说明是白噪声
## 可视化模型未来的预测值
par(family = "STKaiti")
plot(forecast(ARMAmod,h=20))
## 对飞机乘客数据建立季节趋势的ARIMA模型进行预测未来的数据
AirPas <- read.csv("data/chap7/AirPassengers.csv",stringsAsFactors=FALSE)
## 处理为时间序列数据
AirPas$Month <- as.yearmon(AirPas$Month)
AirPas <- ts(AirPas$Passengers,start = AirPas$Month[1],frequency = 12)
head(AirPas)
## Jan Feb Mar Apr May Jun
## 1949 112 118 132 129 121 135
## 可视化序列
autoplot(AirPas)+ggtitle("飞机乘客数量变化趋势")
## 将数据即切分位两个部分,一部分用于训练模型,一部分用于查看预测效果
AirPas_train <- window(AirPas,end=c(1958,12))
AirPas_test <- window(AirPas,star=c(1959,1))
adf.test(AirPas_train,k=12)
##
## Augmented Dickey-Fuller Test
##
## data: AirPas_train
## Dickey-Fuller = -1.8449, Lag order = 12, p-value = 0.641
## alternative hypothesis: stationary
adf.test(diff(AirPas_train),k=12)
##
## Augmented Dickey-Fuller Test
##
## data: diff(AirPas_train)
## Dickey-Fuller = -1.9293, Lag order = 12, p-value = 0.6059
## alternative hypothesis: stationary
adf.test(diff(diff(AirPas_train)),k=12)
## Warning in adf.test(diff(diff(AirPas_train)), k = 12): p-value smaller than
## printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: diff(diff(AirPas_train))
## Dickey-Fuller = -7.6169, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
## 说明数据延迟12阶,原始数据和差分一次数据都有单位根,而差分两次后数据是平稳的
AirPasdiff2 <- diff(diff(AirPas_train))
## 分析序列的自相关系数和偏自相关系数分析参数p和q
p1 <- autoplot(acf(AirPasdiff2,lag.max = 40,plot = F))+
ggtitle("序列自相关图")
p2 <- autoplot(pacf(AirPasdiff2,lag.max = 40,plot = F))+
ggtitle("序列偏自相关图")
gridExtra::grid.arrange(p1,p2,nrow=2)
## 从自相关图和偏自相关图可以很明显的发现数据可能具有周期性,
## 不能很好的确定参数p和q的取值,根据图可知,序列可能具有年周期性
## 使用auto.arima()函数确定模型的参数
auto.arima(AirPas_train)
## Series: AirPas_train
## ARIMA(1,1,0)(0,1,0)[12]
##
## Coefficients:
## ar1
## -0.2397
## s.e. 0.0935
##
## sigma^2 estimated as 103.6: log likelihood=-399.64
## AIC=803.28 AICc=803.4 BIC=808.63
## 最好的模型为ARIMA(1,1,0)(0,1,0)[12]
ARIMA <- arima(AirPas_train, c(1, 1, 0),
seasonal = list(order = c(0, 1, 0),period = 12))
summary(ARIMA)
##
## Call:
## arima(x = AirPas_train, order = c(1, 1, 0), seasonal = list(order = c(0, 1,
## 0), period = 12))
##
## Coefficients:
## ar1
## -0.2397
## s.e. 0.0935
##
## sigma^2 estimated as 102.7: log likelihood = -399.64, aic = 803.28
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.01614662 9.567988 7.120167 -0.03346415 2.90195 0.321312
## ACF1
## Training set 0.00821521
Box.test(ARIMA$residuals,type ="Ljung-Box")
##
## Box-Ljung test
##
## data: ARIMA$residuals
## X-squared = 0.0083029, df = 1, p-value = 0.9274
## p-value = 0.9274,此时,模型的残差已经是白噪声数据,数据中的信息已经充分的提取出来了
## 可视化模型的预测值和这是值之间的差距
par(family = "STKaiti")
plot(forecast(ARIMA,h=24),shadecols="oldstyle")
points(AirPas_test,col = "red")
lines(AirPas_test,col = "red")
library(readxl)
# library(TSA)
## 读取数据
gasco2 <- read_excel("data/chap7/gas furnace data.xlsx")
head(gasco2)
## # A tibble: 6 x 2
## GasRate `C02%`
## <dbl> <dbl>
## 1 -0.109 53.8
## 2 0 53.6
## 3 0.178 53.5
## 4 0.339 53.5
## 5 0.373 53.4
## 6 0.441 53.1
GasRate <- ts(gasco2$GasRate)
CO2 <- ts(gasco2$`C02%`)
p1 <- autoplot(GasRate)
p2 <- autoplot(CO2)
gridExtra::grid.arrange(p1,p2,nrow=2)
## 切分位训练集和测试集
trainnum <- round(nrow(gasco2)*0.8)
GasRate_train <- window(GasRate,end = trainnum)
GasRate_test <- window(GasRate,start = trainnum+1)
CO2_train <- window(CO2,end = trainnum)
CO2_test <- window(CO2,start = trainnum+1)
## 自动寻找合适的p,q
auto.arima(y=CO2_train,xreg = GasRate_train)
## Series: CO2_train
## Regression with ARIMA(2,1,2) errors
##
## Coefficients:
## ar1 ar2 ma1 ma2 xreg
## 1.4773 -0.7232 -0.3818 0.2742 0.0520
## s.e. 0.0858 0.0710 0.1011 0.0881 0.1113
##
## sigma^2 estimated as 0.1006: log likelihood=-62.46
## AIC=136.92 AICc=137.29 BIC=157.7
# ARIMAXmod <- arimax(CO2_train,order = c(2,1,2),xreg = GasRate_train)
#ARIMAXmod <- arima(CO2_train,order = c(2,1,2),xreg = GasRate_train)
ARIMAXmod <- Arima(CO2_train,order = c(2,1,2),xreg = GasRate_train)
summary(ARIMAXmod)
## Series: CO2_train
## Regression with ARIMA(2,1,2) errors
##
## Coefficients:
## ar1 ar2 ma1 ma2 xreg
## 1.4773 -0.7232 -0.3818 0.2742 0.0520
## s.e. 0.0858 0.0710 0.1011 0.0881 0.1113
##
## sigma^2 estimated as 0.1006: log likelihood=-62.46
## AIC=136.92 AICc=137.29 BIC=157.7
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.003694667 0.3131359 0.2320132 0.009857144 0.4359391 0.3913876
## ACF1
## Training set -0.00032096
## 可视化模型的预测值和真实值之间的差距
par(family = "STKaiti")
plot(forecast(ARIMAXmod,h=length(GasRate_test),xreg = GasRate_test))
lines(CO2_test,col="black")
library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
library(zoo)
## 读取数据并对数据重新命名
AirPas <- read.csv("data/chap7/AirPassengers.csv",stringsAsFactors=FALSE)
colnames(AirPas) <- c("ds","y")
AirPas$ds <- as.yearmon(AirPas$ds)
head(AirPas)
## ds y
## 1 1 1949 112
## 2 2 1949 118
## 3 3 1949 132
## 4 4 1949 129
## 5 5 1949 121
## 6 6 1949 135
## 建立具有季节趋势的模型
model <- prophet(AirPas,growth = "linear",
yearly.seasonality = TRUE,weekly.seasonality = FALSE,
daily.seasonality = FALSE,seasonality.mode = "multiplicative")
## 预测后面两年的数据,并将预测结果可视化
future <- make_future_dataframe(model, periods = 24,freq = "month")
forecast <- predict(model, future)
plot(model, forecast)
## 可视化预测的组成部分,主要有线性趋势和季节趋势
prophet_plot_components(model, forecast)