Skip to content

Commit ffc67bb

Browse files
committed
more standard indentation for C++ part
1 parent 007b5f6 commit ffc67bb

File tree

2 files changed

+199
-230
lines changed

2 files changed

+199
-230
lines changed

_posts/2014-01-04-bayesian-time-series-changepoint.md

+101-120
Original file line numberDiff line numberDiff line change
@@ -44,117 +44,107 @@ The initial naive R implementation of the sampler is given below.
4444
# kposs: possible values of k
4545
# phi, k: starting values of chain for phi and k
4646

47-
gibbsloop <- function(nsim, y, a, b, c, d, kposs, phi, k)
48-
{
49-
# matrix to store simulated values from each cycle
50-
out <- matrix(NA, nrow = nsim, ncol = 3)
51-
52-
# number of observations
53-
n <- length(y)
54-
55-
for(i in 1:nsim)
56-
{
57-
# Generate value from full conditional of phi based on
58-
# current values of other parameters
59-
lambda <- rgamma(1, a + sum(y[1:k]), b + k)
60-
61-
# Generate value from full conditional of phi based on
62-
# current values of other parameters
63-
phi <- rgamma(1, c + sum(y[min((k+1),n):n]), d + n - k)
64-
65-
# generate value of k
66-
# determine probability masses for full conditional of k
67-
# based on current parameters values
68-
pmf <- kprobloop(kposs, y, lambda, phi)
69-
k <- sample(x = kposs, size = 1, prob = pmf)
70-
71-
out[i, ] <- c(lambda, phi, k)
72-
}
73-
out
47+
gibbsloop <- function(nsim, y, a, b, c, d, kposs, phi, k) {
48+
# matrix to store simulated values from each cycle
49+
out <- matrix(NA, nrow = nsim, ncol = 3)
50+
51+
# number of observations
52+
n <- length(y)
53+
54+
for(i in 1:nsim) {
55+
# Generate value from full conditional of phi based on
56+
# current values of other parameters
57+
lambda <- rgamma(1, a + sum(y[1:k]), b + k)
58+
59+
# Generate value from full conditional of phi based on
60+
# current values of other parameters
61+
phi <- rgamma(1, c + sum(y[min((k+1),n):n]), d + n - k)
62+
63+
# generate value of k
64+
# determine probability masses for full conditional of k
65+
# based on current parameters values
66+
pmf <- kprobloop(kposs, y, lambda, phi)
67+
k <- sample(x = kposs, size = 1, prob = pmf)
68+
69+
out[i, ] <- c(lambda, phi, k)
70+
}
71+
out
7472
}
7573

7674
# Given a vector of values x, the logsumexp function calculates
7775
# log(sum(exp(x))), but in a "smart" way that helps avoid
7876
# numerical underflow or overflow problems.
7977

80-
logsumexp <- function(x)
81-
{
82-
log(sum(exp(x - max(x)))) + max(x)
78+
logsumexp <- function(x) {
79+
log(sum(exp(x - max(x)))) + max(x)
8380
}
8481

8582
# Determine pmf for full conditional of k based on current values of other
8683
# variables. Takes possible values of k, observed data y, current values of
8784
# lambda, and phi. It does this naively using a loop.
8885

89-
kprobloop <- function(kposs, y, lambda, phi)
90-
{
91-
# create vector to store argument of exponential function of
92-
# unnormalized pmf, then calculate them
93-
x <- numeric(length(kposs))
94-
for(i in kposs)
95-
{
96-
x[i] <- i*(phi - lambda) + sum(y[1:i]) * log(lambda/phi)
97-
}
98-
#return full conditional pmf of k
99-
return(exp(x - logsumexp(x)))
86+
kprobloop <- function(kposs, y, lambda, phi) {
87+
# create vector to store argument of exponential function of
88+
# unnormalized pmf, then calculate them
89+
x <- numeric(length(kposs))
90+
for (i in kposs) {
91+
x[i] <- i*(phi - lambda) + sum(y[1:i]) * log(lambda/phi)
92+
}
93+
# return full conditional pmf of k
94+
return(exp(x - logsumexp(x)))
10095
}
10196
{% endhighlight %}
10297

103-
10498
Looking through the first implemention, we realize that we can
10599
speed up our code by vectorizing when possible and not performing computations
106100
more often than we have to. Specifically, we are calculating `sum(y[1:i])` and `sum(y[min((k+1),n):n])` repeatedly in the loops. We can dramatically improve the speed of our implementation by calculating the sum of `y` and the cumulative sum of `y` and then using the stored results appropriately. We now reimplement the pmf and Gibbs sampler functions with this in
107101
mind.
108102

109103

110104
{% highlight r %}
111-
gibbsvec <- function(nsim, y, a, b, c, d, kposs, phi, k)
112-
{
113-
# matrix to store simulated values from each cycle
114-
out <- matrix(NA, nrow = nsim, ncol = 3)
115-
116-
# determine number of observations
117-
n <- length(y)
118-
119-
# determine sum of y and cumulative sum of y.
120-
# Then cusum[k] == sum(y[1:k])
121-
# and sum(y[(k+1):n]) == sumy - cusum[k]
122-
sumy <- sum(y)
123-
cusum <- cumsum(y)
124-
125-
for(i in 1:nsim)
126-
{
127-
# Generate value from full conditional of phi based on
128-
# current values of other parameters
129-
lambda <- rgamma(1, a + cusum[k], b + k)
105+
gibbsvec <- function(nsim, y, a, b, c, d, kposs, phi, k) {
106+
# matrix to store simulated values from each cycle
107+
out <- matrix(NA, nrow = nsim, ncol = 3)
108+
109+
# determine number of observations
110+
n <- length(y)
111+
112+
# determine sum of y and cumulative sum of y.
113+
# Then cusum[k] == sum(y[1:k])
114+
# and sum(y[(k+1):n]) == sumy - cusum[k]
115+
sumy <- sum(y)
116+
cusum <- cumsum(y)
117+
118+
for (i in 1:nsim) {
119+
# Generate value from full conditional of phi based on
120+
# current values of other parameters
121+
lambda <- rgamma(1, a + cusum[k], b + k)
130122
131-
# Generate value from full conditional of phi based on
132-
# current values of other parameters
133-
phi <- rgamma(1, c + sumy - cusum[k], d + n - k)
123+
# Generate value from full conditional of phi based on
124+
# current values of other parameters
125+
phi <- rgamma(1, c + sumy - cusum[k], d + n - k)
134126
135-
# generate value of k
136-
pmf <- kprobvec(kposs, cusum, lambda, phi)
137-
k <- sample(x = kposs, size = 1, prob = pmf)
127+
# generate value of k
128+
pmf <- kprobvec(kposs, cusum, lambda, phi)
129+
k <- sample(x = kposs, size = 1, prob = pmf)
138130
139-
out[i, ] <- c(lambda, phi, k)
140-
}
141-
out
131+
out[i, ] <- c(lambda, phi, k)
132+
}
133+
out
142134
}
143135

144136
# Determine pmf for full conditional of k based on current values of other
145137
# variables. Do this efficiently using vectors and stored information.
146138
# cusum is the cumulative sum of y.
147139

148-
kprobvec <- function(kposs, cusum, lambda, phi)
149-
{
150-
# calculate exponential argument of numerator of unnormalized pmf
151-
x <- kposs * (phi - lambda) + cusum * log(lambda/phi)
152-
# return pmf of full conditional for k
153-
return(exp(x - logsumexp(x)))
140+
kprobvec <- function(kposs, cusum, lambda, phi) {
141+
# calculate exponential argument of numerator of unnormalized pmf
142+
x <- kposs * (phi - lambda) + cusum * log(lambda/phi)
143+
# return pmf of full conditional for k
144+
return(exp(x - logsumexp(x)))
154145
}
155146
{% endhighlight %}
156147

157-
158148
We should be able to improve the speed of our sampler by implementing it in C++. We can reimplement the vectorized version of the sampler fairly easily using Rcpp and RcppArmadillo. Note that the parameterization of `rgamma` used by Rcpp is slightly different from base R.
159149

160150

@@ -164,52 +154,47 @@ We should be able to improve the speed of our sampler by implementing it in C++.
164154
using namespace Rcpp;
165155

166156
// [[Rcpp::export]]
167-
double logsumexp(NumericVector x)
168-
{
169-
return(log(sum(exp(x - max(x)))) + max(x));
157+
double logsumexp(NumericVector x) {
158+
return(log(sum(exp(x - max(x)))) + max(x));
170159
}
171160

172161
// [[Rcpp::export]]
173-
NumericVector kprobcpp(NumericVector kposs, NumericVector cusum, double lambda, double phi)
174-
{
175-
NumericVector x = kposs * (phi - lambda) + cusum * log(lambda/phi);
176-
return(exp(x - logsumexp(x)));
162+
NumericVector kprobcpp(NumericVector kposs, NumericVector cusum, double lambda, double phi) {
163+
NumericVector x = kposs * (phi - lambda) + cusum * log(lambda/phi);
164+
return(exp(x - logsumexp(x)));
177165
}
178166

179167
// [[Rcpp::export]]
180168
NumericMatrix gibbscpp(int nsim, NumericVector y, double a, double b, double c,
181-
double d, NumericVector kposs, NumericVector phi, NumericVector k)
182-
{
183-
RNGScope scope ;
184-
185-
NumericVector lambda;
186-
NumericVector pmf;
187-
NumericMatrix out(nsim, 3);
188-
189-
// determine sum of y and cumulative sum of y.
190-
double sumy = sum(y);
191-
NumericVector cusum = cumsum(y);
192-
193-
for(int i = 0; i < nsim; i++)
194-
{
195-
lambda = rgamma(1, a + cusum[k[0] - 1], 1/(b + k[0]));
196-
phi = rgamma(1, c + sumy - cusum[k[0] - 1], 1/(d + 112 - k[0]));
197-
198-
pmf = kprobcpp(kposs, cusum, lambda[0], phi[0]);
199-
k = RcppArmadillo::sample(kposs, 1, false, pmf) ;
200-
201-
// store values of this cycle
202-
out(i, 0) = lambda[0];
203-
out(i, 1) = phi[0];
204-
out(i, 2) = k[0];
205-
}
206-
207-
// return results
208-
return out;
169+
double d, NumericVector kposs, NumericVector phi, NumericVector k) {
170+
RNGScope scope ;
171+
172+
NumericVector lambda;
173+
NumericVector pmf;
174+
NumericMatrix out(nsim, 3);
175+
176+
// determine sum of y and cumulative sum of y.
177+
double sumy = sum(y);
178+
NumericVector cusum = cumsum(y);
179+
180+
for(int i = 0; i < nsim; i++) {
181+
lambda = rgamma(1, a + cusum[k[0] - 1], 1/(b + k[0]));
182+
phi = rgamma(1, c + sumy - cusum[k[0] - 1], 1/(d + 112 - k[0]));
183+
184+
pmf = kprobcpp(kposs, cusum, lambda[0], phi[0]);
185+
k = RcppArmadillo::sample(kposs, 1, false, pmf) ;
186+
187+
// store values of this cycle
188+
out(i, 0) = lambda[0];
189+
out(i, 1) = phi[0];
190+
out(i, 2) = k[0];
191+
}
192+
193+
// return results
194+
return out;
209195
}
210196
{% endhighlight %}
211197

212-
213198
We will now compare and apply the implementations to a coal mining data set.
214199

215200
Coal mining is a notoriously dangerous occupation. Consider the tally of
@@ -230,7 +215,6 @@ counts <- c(4,5,4,1,0,4,3,4,0,6,3,3,4,0,2,6,3,3,5,4,5,3,1,4,4,1,5,5,3,4,2,
230215
0,0,1,4,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1)
231216
{% endhighlight %}
232217

233-
234218
For simplicity, we'll refer to 1851 as year 1, 1852 as year 2, ..., 1962 as
235219
year 112. We will set the initial values for `phi` and `k` to be 1 and 40,
236220
respectively. We will set the hyperparameters `a`, `b`, `c`, and `d` to be 4, 1, 1, and 2, respectively.
@@ -256,7 +240,6 @@ all.equal(chain1, chain2, chain3)
256240
[1] TRUE
257241
</pre>
258242

259-
260243
We now compare compare the relative speed of the three implementations using the `benchmark` function.
261244

262245

@@ -276,12 +259,11 @@ benchmark(gibbsloop(nsim=nsim,y=counts,a=4,b=1,c=1,d=2,kposs=1:112,phi=1,k=40),
276259
2 gibbsvec(nsim = nsim, y = counts, a = 4, b = 1, c = 1, d = 2, kposs = 1:112, phi = 1, k = 40)
277260
1 gibbsloop(nsim = nsim, y = counts, a = 4, b = 1, c = 1, d = 2, kposs = 1:112, phi = 1, k = 40)
278261
replications elapsed relative
279-
3 10 2.392 1.000
280-
2 10 7.044 2.945
281-
1 10 116.531 48.717
262+
3 10 2.207 1.000
263+
2 10 5.611 2.542
264+
1 10 68.118 30.865
282265
</pre>
283266

284-
285267
The C++ version of the Gibbs sampler is a vast improvement over the looped R
286268
version and also quite a bit faster than the vectorized R version.
287269

@@ -323,4 +305,3 @@ colored blue.
323305
#lines(yr[1:changeyear], counts[1:changeyear], col = "orange")
324306
#lines(yr[-(1:(changeyear-1))], counts[-(1:(changeyear-1))], col = "blue")
325307
{% endhighlight %}
326-

0 commit comments

Comments
 (0)