Crash Recovery

Logging

To make file-system operations atomic, xv6 uses logging to avoid data inconsistency if a crash happens in the middle of a write. An xv6 system call does not write directly to the on-disk file-system blocks; instead, it records the blocks it modifies in a log. Only after the transaction is committed does xv6 itself copy the logged blocks to their home locations on disk.

Log design

The disk reserves a contiguous region for the log. It consists of a header block for metadata, followed by a sequence of block copies. The header block records the number of logged blocks and the home block number of each one. In xv6, only one transaction is in progress at a time.

Group commit: xv6 can pack several system calls into one transaction, which increases parallelism. Also, because the number of log blocks is limited, xv6 splits one large write into several smaller transactions so that each one fits in the log.

  • Write-ahead: the modified blocks are first written to the log blocks; only then does the system start writing them to their home locations.
  • Freeing: the log blocks are freed only after all of them have been written to their home locations and the header block has been wiped.

Code Analysis:

  • Start with begin_op(), which tells the OS that the caller is about to start a safe, atomic transaction.

    log.outstanding is the number of system calls that have joined the current transaction and have not yet called end_op().

    MAXOPBLOCKS is the maximum number of log blocks a single system call may use.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// Join the current transaction. Called at the start of each FS system call.
// Blocks (sleeping on &log) while a commit is in progress, or while admitting
// one more operation could exceed the log: every outstanding operation is
// assumed to need up to MAXOPBLOCKS blocks on top of the log.lh.n blocks
// already recorded.
void
begin_op(void)
{
  acquire(&log.lock);

  // sleep() is called with log.lock held, and the condition is
  // re-evaluated after every wakeup.
  while(log.committing ||
        log.lh.n + (log.outstanding+1)*MAXOPBLOCKS > LOGSIZE){
    sleep(&log, &log.lock);
  }

  // Safe to proceed: count this operation as part of the transaction.
  log.outstanding += 1;
  release(&log.lock);
}
  • log_write(struct buf *b): records the modified block's number in the in-memory log header. If the block is not already in the log, this function pins the buffer in the buffer cache (to satisfy the write-ahead rule) and reserves a slot for it by incrementing log.lh.n.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
// Caller has modified b->data and is done with the buffer.
// Record the block number in the in-memory log header and pin the buffer
// in the cache (bpin) so it stays there until commit.
// commit()/write_log() will do the disk write.
//
// log_write() replaces bwrite(); a typical use is:
//   bp = bread(...)
//   modify bp->data[]
//   log_write(bp)
//   brelse(bp)
//
// Panics if the log is already full or if no transaction is open
// (i.e. the caller did not go through begin_op()).
void
log_write(struct buf *b)
{
  int i;

  // Take the lock *before* the sanity checks: log.outstanding is changed
  // under log.lock by begin_op()/end_op() and log.lh.n by concurrent
  // log_write() calls, so reading them unlocked would be a data race.
  acquire(&log.lock);
  if (log.lh.n >= LOGSIZE || log.lh.n >= log.size - 1)
    panic("too big a transaction");
  if (log.outstanding < 1)
    panic("log_write outside of trans");

  // Log absorption: if this block is already part of the transaction,
  // reuse its slot instead of consuming a new one.
  for (i = 0; i < log.lh.n; i++) {
    if (log.lh.block[i] == b->blockno)
      break;
  }
  log.lh.block[i] = b->blockno;
  if (i == log.lh.n) {  // block was not in the log yet: add it
    bpin(b);
    log.lh.n++;
  }
  release(&log.lock);
}
  • end_op() decrements outstanding and performs the commit if outstanding becomes zero.

    Note that wakeup(&log) wakes up the other processes sleeping on the channel &log: once a process reaches end_op(), it no longer needs the log space it reserved in begin_op().

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
// Leave the current transaction. Called at the end of each FS system call.
// The caller that drops log.outstanding to zero performs the commit.
void
end_op(void)
{
  int last_op;  // nonzero if this call closed the transaction

  acquire(&log.lock);
  log.outstanding -= 1;
  if(log.committing)
    panic("log.committing");
  last_op = (log.outstanding == 0);
  if(last_op){
    log.committing = 1;
  } else {
    // The reservation made in begin_op() has been given back, so a
    // begin_op() sleeping for log space may now be able to proceed.
    wakeup(&log);
  }
  release(&log.lock);

  if(!last_op)
    return;

  // commit() is called without holding log.lock, since sleeping
  // with locks held is not allowed.
  commit();
  acquire(&log.lock);
  log.committing = 0;
  wakeup(&log);
  release(&log.lock);
}
  • commit()
1
2
3
4
5
6
7
8
9
10
11
// Commit the current transaction; does nothing if no block was logged.
static void
commit()
{
  if (log.lh.n <= 0)
    return;

  write_log();        // copy the modified blocks from the cache into the log
  write_head();       // header reaches disk -- the real commit point
  install_trans(0);   // copy the logged blocks to their home locations
  log.lh.n = 0;
  write_head();       // erase the transaction from the on-disk log
}

write_log() writes all modified buffers into the log blocks.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// For every block recorded in the in-memory header, copy its cached
// contents into the matching log block and write that log block to disk.
static void
write_log(void)
{
  int slot = 0;

  while (slot < log.lh.n) {
    // Log data blocks begin one block after log.start (the header block).
    struct buf *logbuf = bread(log.dev, log.start + slot + 1);
    struct buf *cached = bread(log.dev, log.lh.block[slot]);

    memmove(logbuf->data, cached->data, BSIZE);
    bwrite(logbuf);  // write the log block
    brelse(cached);
    brelse(logbuf);
    slot++;
  }
}

write_head() writes the header block to disk; this is the point at which the transaction really commits.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
// Flush the in-memory log header (block count plus block numbers) to the
// header block at log.start. When the count is non-zero this write is
// the true commit point of the current transaction.
static void
write_head(void)
{
  struct buf *hdrbuf = bread(log.dev, log.start);
  struct logheader *ondisk = (struct logheader *) (hdrbuf->data);
  int k = 0;

  ondisk->n = log.lh.n;
  while (k < log.lh.n) {
    ondisk->block[k] = log.lh.block[k];
    k++;
  }
  bwrite(hdrbuf);
  brelse(hdrbuf);
}

The remaining two functions are easy to understand from their names.

install_trans() copies each log block to its home data block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
// Copy every logged block from the log area to its home location on disk.
// When recovering is zero, each destination buffer is also unpinned
// (bunpin); when it is non-zero the unpin is skipped.
static void
install_trans(int recovering)
{
  int slot;

  for (slot = 0; slot < log.lh.n; slot++) {
    struct buf *logged = bread(log.dev, log.start + slot + 1);  // log copy
    struct buf *home = bread(log.dev, log.lh.block[slot]);      // destination

    memmove(home->data, logged->data, BSIZE);  // copy block to destination
    bwrite(home);                              // write destination to disk
    if (!recovering)
      bunpin(home);
    brelse(logged);
    brelse(home);
  }
}