xv6/fs.c

650 lines
17 KiB
C

// File system implementation. Five layers:
// + Blocks: allocator for raw disk blocks.
// + Log: crash recovery for multi-step updates.
// + Files: inode allocator, reading, writing, metadata.
// + Directories: inode with special contents (list of other inodes!)
// + Names: paths like /usr/rtm/xv6/fs.c for convenient naming.
//
// This file contains the low-level file system manipulation
// routines. The (higher-level) system call implementations
// are in sysfile.c.
#include "types.h"
#include "defs.h"
#include "param.h"
#include "stat.h"
#include "mmu.h"
#include "proc.h"
#include "spinlock.h"
#include "sleeplock.h"
#include "fs.h"
#include "buf.h"
#include "file.h"
// Smaller of a and b.
// NOTE: this is a macro, so each argument appears (and may be evaluated)
// more than once; pass only side-effect-free expressions.
#define min(a, b) ((a) < (b) ? (a) : (b))
// Forward declaration: itrunc() is defined further down in this file
// but is called earlier, from iput().
static void itrunc(struct inode*);
// there should be one superblock per disk device, but we run with
// only one device
// In-memory copy of the super block; filled in by iinit() via readsb()
// and consulted by the block and inode allocators below.
struct superblock sb;
// Copy the super block of device dev into *sb.
// The super block is read from block number 1 of the device; the
// buffer is released again before returning.
void readsb(int dev, struct superblock *sb) {
  struct buf *blk = bread(dev, 1);

  memmove(sb, blk->data, sizeof(*sb));
  brelse(blk);
}
// Fill block bno of device dev with zero bytes.
// The zeroed buffer is handed to log_write() before it is released.
static void bzero(int dev, int bno) {
  struct buf *blk = bread(dev, bno);

  memset(blk->data, 0, BSIZE);
  log_write(blk);
  brelse(blk);
}
// Blocks.
// Allocate a zeroed disk block on device dev and return its number.
// Scans the free bitmap one bitmap block (BPB bits) at a time, claims
// the first clear bit, and zeroes the corresponding data block.
// Panics if every block up to sb.size is already in use.
static uint balloc(uint dev) {
  int base, idx, mask;
  struct buf *bitmap;

  for (base = 0; base < sb.size; base += BPB) {
    bitmap = bread(dev, BBLOCK(base, sb));
    for (idx = 0; idx < BPB && base + idx < sb.size; idx++) {
      mask = 1 << (idx % 8);
      if ((bitmap->data[idx / 8] & mask) != 0) {
        continue;  // Block is already in use; keep looking.
      }
      bitmap->data[idx / 8] |= mask;  // Claim the block.
      log_write(bitmap);
      brelse(bitmap);
      bzero(dev, base + idx);
      return base + idx;
    }
    brelse(bitmap);
  }
  panic("balloc: out of blocks");
}
// Release disk block b on device dev by clearing its bit in the
// free bitmap. Panics if the bit is already clear.
static void bfree(int dev, uint b) {
  struct buf *bitmap = bread(dev, BBLOCK(b, sb));
  int bit = b % BPB;          // Position of b inside its bitmap block.
  int mask = 1 << (bit % 8);  // Bit within the selected byte.

  if (!(bitmap->data[bit / 8] & mask)) {
    panic("freeing free block");
  }
  bitmap->data[bit / 8] &= ~mask;
  log_write(bitmap);
  brelse(bitmap);
}
// Inodes.
//
// An inode describes a single unnamed file.
// The inode disk structure holds metadata: the file's type,
// its size, the number of links referring to it, and the
// list of blocks holding the file's content.
//
// The inodes are laid out sequentially on disk at
// sb.startinode. Each inode has a number, indicating its
// position on the disk.
//
// The kernel keeps a cache of in-use inodes in memory
// to provide a place for synchronizing access
// to inodes used by multiple processes. The cached
// inodes include book-keeping information that is
// not stored on disk: ip->ref and ip->valid.
//
// An inode and its in-memory representation go through a
// sequence of states before they can be used by the
// rest of the file system code.
//
// * Allocation: an inode is allocated if its type (on disk)
// is non-zero. ialloc() allocates, and iput() frees if
// the reference and link counts have fallen to zero.
//
// * Referencing in cache: an entry in the inode cache
// is free if ip->ref is zero. Otherwise ip->ref tracks
// the number of in-memory pointers to the entry (open
// files and current directories). iget() finds or
// creates a cache entry and increments its ref; iput()
// decrements ref.
//
// * Valid: the information (type, size, &c) in an inode
// cache entry is only correct when ip->valid is 1.
// ilock() reads the inode from
// the disk and sets ip->valid, while iput() clears
// ip->valid if ip->ref has fallen to zero.
//
// * Locked: file system code may only examine and modify
// the information in an inode and its content if it
// has first locked the inode.
//
// Thus a typical sequence is:
// ip = iget(dev, inum)
// ilock(ip)
// ... examine and modify ip->xxx ...
// iunlock(ip)
// iput(ip)
//
// ilock() is separate from iget() so that system calls can
// get a long-term reference to an inode (as for an open file)
// and only lock it for short periods (e.g., in read()).
// The separation also helps avoid deadlock and races during
// pathname lookup. iget() increments ip->ref so that the inode
// stays cached and pointers to it remain valid.
//
// Many internal file system functions expect the caller to
// have locked the inodes involved; this lets callers create
// multi-step atomic operations.
//
// The icache.lock spin-lock protects the allocation of icache
// entries. Since ip->ref indicates whether an entry is free,
// and ip->dev and ip->inum indicate which i-node an entry
// holds, one must hold icache.lock while using any of those fields.
//
// An ip->lock sleep-lock protects all ip-> fields other than ref,
// dev, and inum. One must hold ip->lock in order to
// read or write that inode's ip->valid, ip->size, ip->type, &c.
// The in-memory inode cache: a fixed table of NINODE entries plus the
// spin-lock that guards allocation of those entries (see the comment
// block above for which fields each lock protects).
struct {
// Protects the ref, dev and inum fields of every entry in inode[].
struct spinlock lock;
// Cache slots; a slot whose ref is zero is free for reuse by iget().
struct inode inode[NINODE];
} icache;
// Initialize the inode cache and load the super block of device dev.
// Sets up icache.lock and the sleep-lock of every cache slot, reads the
// super block into the global sb, and prints its layout fields.
void iinit(int dev) {
  int i;

  initlock(&icache.lock, "icache");
  for (i = 0; i < NINODE; i++) {
    initsleeplock(&icache.inode[i].lock, "inode");
  }
  readsb(dev, &sb);
  // The format string is split with adjacent string literals rather
  // than a backslash line continuation: with a continuation, the space
  // between "logstart %d" and "inodestart" depends on how the next
  // source line happens to be indented.
  cprintf("sb: size %d nblocks %d ninodes %d nlog %d logstart %d"
          " inodestart %d bmap start %d\n", sb.size, sb.nblocks,
          sb.ninodes, sb.nlog, sb.logstart, sb.inodestart,
          sb.bmapstart);
}
static struct inode* iget(uint dev, uint inum);
// Allocate an on-disk inode on device dev and give it the type `type`,
// which is what marks it as allocated.
// Inode numbers are scanned starting at 1; the first one whose on-disk
// type is 0 is taken. Returns the matching cache entry from iget(),
// i.e. referenced but not locked. Panics if no free inode exists.
struct inode* ialloc(uint dev, short type) {
  int inum;
  struct buf *blk;
  struct dinode *slot;

  for (inum = 1; inum < sb.ninodes; inum++) {
    blk = bread(dev, IBLOCK(inum, sb));
    slot = (struct dinode*)blk->data + inum % IPB;
    if (slot->type != 0) {
      // Already allocated; move on to the next inode.
      brelse(blk);
      continue;
    }
    memset(slot, 0, sizeof(*slot));
    slot->type = type;
    log_write(blk);  // mark it allocated on the disk
    brelse(blk);
    return iget(dev, inum);
  }
  panic("ialloc: no inodes");
}
// Write the on-disk fields of in-memory inode ip back to its slot in
// the inode block. The inode cache is write-through, so this must be
// called after every change to an ip->xxx field that lives on disk.
// Caller must hold ip->lock.
void iupdate(struct inode *ip) {
  struct buf *blk = bread(ip->dev, IBLOCK(ip->inum, sb));
  struct dinode *disk = (struct dinode*)blk->data + ip->inum % IPB;

  memmove(disk->addrs, ip->addrs, sizeof(ip->addrs));
  disk->size = ip->size;
  disk->nlink = ip->nlink;
  disk->minor = ip->minor;
  disk->major = ip->major;
  disk->type = ip->type;
  log_write(blk);
  brelse(blk);
}
// Return the cache entry for inode inum on device dev, taking a new
// reference on it. If the inode is not cached yet, the first free slot
// (ref == 0) is claimed for it with valid cleared. The inode is
// neither locked nor read from disk here. Panics if the cache is full.
static struct inode* iget(uint dev, uint inum) {
  struct inode *slot;
  struct inode *unused = 0;
  int i;

  acquire(&icache.lock);

  // One pass: look for a cached copy and note the first free slot.
  for (i = 0; i < NINODE; i++) {
    slot = &icache.inode[i];
    if (slot->ref > 0 && slot->dev == dev && slot->inum == inum) {
      slot->ref++;
      release(&icache.lock);
      return slot;
    }
    if (unused == 0 && slot->ref == 0) {
      unused = slot;
    }
  }

  // Not cached: recycle the free slot found above.
  if (unused == 0) {
    panic("iget: no inodes");
  }
  unused->dev = dev;
  unused->inum = inum;
  unused->ref = 1;
  unused->valid = 0;
  release(&icache.lock);
  return unused;
}
// Take one more reference on ip, under icache.lock.
// Hands ip back so callers can write ip = idup(ip1).
struct inode* idup(struct inode *ip) {
  acquire(&icache.lock);
  ip->ref += 1;
  release(&icache.lock);

  return ip;
}
// Acquire ip's sleep-lock. If the cached copy is not yet valid, load
// the inode's fields from disk and mark it valid.
// Panics if ip is null, has no references, or turns out to have
// type 0 on disk.
void ilock(struct inode *ip) {
  struct buf *blk;
  struct dinode *disk;

  if (ip == 0 || ip->ref < 1) {
    panic("ilock");
  }

  acquiresleep(&ip->lock);
  if (ip->valid != 0) {
    return;  // Cached fields are already up to date.
  }

  blk = bread(ip->dev, IBLOCK(ip->inum, sb));
  disk = (struct dinode*)blk->data + ip->inum % IPB;
  ip->type = disk->type;
  ip->major = disk->major;
  ip->minor = disk->minor;
  ip->nlink = disk->nlink;
  ip->size = disk->size;
  memmove(ip->addrs, disk->addrs, sizeof(ip->addrs));
  brelse(blk);
  ip->valid = 1;
  if (ip->type == 0) {
    panic("ilock: no type");
  }
}
// Release ip's sleep-lock.
// Panics unless ip is non-null, its sleep-lock is held according to
// holdingsleep(), and it has at least one reference.
void iunlock(struct inode *ip) {
  if (!(ip != 0 && holdingsleep(&ip->lock) && ip->ref >= 1)) {
    panic("iunlock");
  }

  releasesleep(&ip->lock);
}
// Give up one reference to in-memory inode ip.
// When the inode is valid, has no links, and this is the only
// reference, its content is truncated and the inode is freed on disk
// (type set to 0) before the reference is dropped. Once ref reaches
// zero the cache slot can be reused by iget().
// All calls to iput() must be inside a transaction in case the inode
// has to be freed.
void iput(struct inode *ip) {
  int refs;

  acquiresleep(&ip->lock);
  if (ip->valid && ip->nlink == 0) {
    // ref is protected by icache.lock, so sample it under that lock.
    acquire(&icache.lock);
    refs = ip->ref;
    release(&icache.lock);

    if (refs == 1) {
      // No links and no other references: truncate and free.
      itrunc(ip);
      ip->type = 0;
      iupdate(ip);
      ip->valid = 0;
    }
  }
  releasesleep(&ip->lock);

  acquire(&icache.lock);
  ip->ref -= 1;
  release(&icache.lock);
}
// Convenience wrapper for the frequent pair of calls:
// release ip's lock with iunlock(), then drop the reference with iput().
void iunlockput(struct inode *ip) {
  iunlock(ip);

  iput(ip);
}
// Inode content
//
// The content (data) associated with each inode is stored
// in blocks on the disk. The first NDIRECT block numbers
// are listed in ip->addrs[]. The next NINDIRECT blocks are
// listed in block ip->addrs[NDIRECT].
// Map logical block number bn of inode ip to a disk block address.
// Blocks 0..NDIRECT-1 come straight from ip->addrs[]; the following
// NINDIRECT blocks are looked up in the indirect block named by
// ip->addrs[NDIRECT]. Any missing block (including the indirect block
// itself) is allocated with balloc() on the way.
// Panics if bn lies beyond the indirect range.
static uint bmap(struct inode *ip, uint bn) {
  uint addr;
  uint *table;
  struct buf *ind;

  if (bn < NDIRECT) {
    addr = ip->addrs[bn];
    if (addr == 0) {
      addr = balloc(ip->dev);
      ip->addrs[bn] = addr;
    }
    return addr;
  }

  bn -= NDIRECT;
  if (bn < NINDIRECT) {
    // Load indirect block, allocating if necessary.
    addr = ip->addrs[NDIRECT];
    if (addr == 0) {
      addr = balloc(ip->dev);
      ip->addrs[NDIRECT] = addr;
    }
    ind = bread(ip->dev, addr);
    table = (uint*)ind->data;
    addr = table[bn];
    if (addr == 0) {
      addr = balloc(ip->dev);
      table[bn] = addr;
      log_write(ind);  // The indirect block itself was modified.
    }
    brelse(ind);
    return addr;
  }

  panic("bmap: out of range");
}
// Discard all content of inode ip: free every direct block, every
// block listed in the indirect block, and the indirect block itself,
// then set the size to 0 and write the inode back with iupdate().
// Called from iput() once the inode has no links and the caller
// holds the only in-memory reference.
static void itrunc(struct inode *ip) {
  int d, k;
  struct buf *ind;
  uint *table;

  for (d = 0; d < NDIRECT; d++) {
    if (ip->addrs[d] == 0) {
      continue;
    }
    bfree(ip->dev, ip->addrs[d]);
    ip->addrs[d] = 0;
  }

  if (ip->addrs[NDIRECT] != 0) {
    ind = bread(ip->dev, ip->addrs[NDIRECT]);
    table = (uint*)ind->data;
    for (k = 0; k < NINDIRECT; k++) {
      if (table[k] != 0) {
        bfree(ip->dev, table[k]);
      }
    }
    brelse(ind);
    bfree(ip->dev, ip->addrs[NDIRECT]);
    ip->addrs[NDIRECT] = 0;
  }

  ip->size = 0;
  iupdate(ip);
}
// Fill *st with the stat fields of inode ip: device, inode number,
// type, link count and size.
// Caller must hold ip->lock.
void stati(struct inode *ip, struct stat *st) {
  st->type = ip->type;
  st->dev = ip->dev;
  st->ino = ip->inum;
  st->nlink = ip->nlink;
  st->size = ip->size;
}
// Read up to n bytes from inode ip, starting at byte offset off,
// into dst.
// For a T_DEV inode the request is forwarded to the device's read
// routine in devsw[] (off is not passed on); -1 is returned if the
// major number is out of range or no read routine is registered.
// For other inodes, returns -1 if off is past the end of the file or
// off + n wraps around; otherwise the count is clipped to the end of
// the file and the number of bytes copied is returned.
// Caller must hold ip->lock.
int readi(struct inode *ip, char *dst, uint off, uint n) {
  uint done, chunk, boff;
  struct buf *blk;

  if (ip->type == T_DEV) {
    if (ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read) {
      return -1;
    }
    return devsw[ip->major].read(ip, dst, n);
  }

  if (off > ip->size || off + n < off) {
    return -1;
  }
  if (off + n > ip->size) {
    n = ip->size - off;
  }

  done = 0;
  while (done < n) {
    blk = bread(ip->dev, bmap(ip, off / BSIZE));
    boff = off % BSIZE;
    // Copy the smaller of: what is still wanted, what is left in
    // this block.
    chunk = n - done;
    if (chunk > BSIZE - boff) {
      chunk = BSIZE - boff;
    }
    memmove(dst, blk->data + boff, chunk);
    brelse(blk);
    done += chunk;
    off += chunk;
    dst += chunk;
  }
  return n;
}
// Write n bytes from src into inode ip, starting at byte offset off.
// For a T_DEV inode the request is forwarded to the device's write
// routine in devsw[] (off is not passed on); -1 is returned if the
// major number is out of range or no write routine is registered.
// For other inodes, returns -1 if off is past the end of the file,
// off + n wraps around, or the write would exceed MAXFILE * BSIZE.
// Otherwise the data is written block by block, the inode's size is
// extended (and the inode written back) if the write ended past the
// old end, and n is returned.
// Caller must hold ip->lock.
int writei(struct inode *ip, char *src, uint off, uint n) {
  uint done, chunk, boff;
  struct buf *blk;

  if (ip->type == T_DEV) {
    if (ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write) {
      return -1;
    }
    return devsw[ip->major].write(ip, src, n);
  }

  if (off > ip->size || off + n < off) {
    return -1;
  }
  if (off + n > MAXFILE * BSIZE) {
    return -1;
  }

  done = 0;
  while (done < n) {
    blk = bread(ip->dev, bmap(ip, off / BSIZE));
    boff = off % BSIZE;
    // Copy the smaller of: what is still to be written, what fits
    // in the rest of this block.
    chunk = n - done;
    if (chunk > BSIZE - boff) {
      chunk = BSIZE - boff;
    }
    memmove(blk->data + boff, src, chunk);
    log_write(blk);
    brelse(blk);
    done += chunk;
    off += chunk;
    src += chunk;
  }

  // off now points just past the last byte written.
  if (n > 0 && off > ip->size) {
    ip->size = off;
    iupdate(ip);
  }
  return n;
}
// Directories
// Compare two directory-entry names, looking at no more than DIRSIZ
// bytes of each. Returns the result of strncmp().
int namecmp(const char *s, const char *t) {
  int diff = strncmp(s, t, DIRSIZ);

  return diff;
}
// Search directory dp for an entry called name.
// On a match, stores the entry's byte offset in *poff (when poff is
// non-null) and returns the entry's inode via iget(), i.e. referenced
// but unlocked. Returns 0 if no entry matches.
// Panics if dp is not a directory or an entry cannot be read in full.
struct inode* dirlookup(struct inode *dp, char *name, uint *poff) {
  uint off;
  struct dirent ent;

  if (dp->type != T_DIR) {
    panic("dirlookup not DIR");
  }

  for (off = 0; off < dp->size; off += sizeof(ent)) {
    if (readi(dp, (char*)&ent, off, sizeof(ent)) != sizeof(ent)) {
      panic("dirlookup read");
    }
    if (ent.inum == 0) {
      continue;  // Unused slot.
    }
    if (namecmp(name, ent.name) != 0) {
      continue;  // Different name.
    }
    // Entry matches the path element.
    if (poff) {
      *poff = off;
    }
    return iget(dp->dev, ent.inum);
  }

  return 0;
}
// Add the entry (name, inum) to directory dp.
// Returns -1 if an entry with that name already exists, 0 on success.
// The entry goes into the first slot whose inum is 0, or is appended
// at dp->size when every slot is in use.
// Panics if a slot cannot be read or the new entry cannot be written.
int dirlink(struct inode *dp, char *name, uint inum) {
  int off;
  struct dirent ent;
  struct inode *existing;

  // Refuse to create a duplicate name.
  existing = dirlookup(dp, name, 0);
  if (existing != 0) {
    iput(existing);
    return -1;
  }

  // Find a free slot; off ends at dp->size when there is none.
  off = 0;
  while (off < dp->size) {
    if (readi(dp, (char*)&ent, off, sizeof(ent)) != sizeof(ent)) {
      panic("dirlink read");
    }
    if (ent.inum == 0) {
      break;
    }
    off += sizeof(ent);
  }

  strncpy(ent.name, name, DIRSIZ);
  ent.inum = inum;
  if (writei(dp, (char*)&ent, off, sizeof(ent)) != sizeof(ent)) {
    panic("dirlink");
  }
  return 0;
}
// Paths
// Split the first element off path and copy it into name.
// Returns a pointer to whatever follows that element, with leading
// slashes already skipped, so the caller can test *path == '\0' to
// learn whether name was the final element.
// Returns 0 when path holds no element at all.
// An element of DIRSIZ bytes or more is cut to DIRSIZ bytes and is
// then NOT NUL-terminated; shorter elements are NUL-terminated.
//
// Examples:
// skipelem("a/bb/c", name) = "bb/c", setting name = "a"
// skipelem("///a//bb", name) = "bb", setting name = "a"
// skipelem("a", name) = "", setting name = "a"
// skipelem("", name) = skipelem("////", name) = 0
//
static char* skipelem(char *path, char *name) {
  char *start;
  int n;

  for (; *path == '/'; path++) {
    // Skip leading slashes.
  }
  if (*path == 0) {
    return 0;
  }

  start = path;
  for (; *path != '/' && *path != 0; path++) {
    // Advance to the end of this element.
  }
  n = path - start;

  if (n < DIRSIZ) {
    memmove(name, start, n);
    name[n] = 0;
  }
  else {
    memmove(name, start, DIRSIZ);
  }

  for (; *path == '/'; path++) {
    // Skip the slashes that follow the element.
  }
  return path;
}
// Resolve path to an inode, one element at a time.
// The walk starts at the root inode when path begins with '/', and at
// the current process's cwd otherwise.
// With nameiparent == 0, returns the inode the whole path names.
// With nameiparent != 0, returns the inode of the parent directory and
// leaves the final path element in name, which must have room for
// DIRSIZ bytes; returns 0 if the path has no final element.
// Returns 0 if an intermediate inode is not a directory or an element
// is not found. The returned inode is referenced but unlocked.
// Must be called inside a transaction since it calls iput().
static struct inode* namex(char *path, int nameiparent, char *name) {
  struct inode *cur, *child;

  cur = (*path == '/') ? iget(ROOTDEV, ROOTINO) : idup(myproc()->cwd);

  for (;;) {
    path = skipelem(path, name);
    if (path == 0) {
      break;
    }

    ilock(cur);
    if (cur->type != T_DIR) {
      iunlockput(cur);
      return 0;
    }
    if (nameiparent && *path == '\0') {
      // Stop one level early.
      iunlock(cur);
      return cur;
    }

    // The directory is released whether or not the lookup succeeds.
    child = dirlookup(cur, name, 0);
    iunlockput(cur);
    if (child == 0) {
      return 0;
    }
    cur = child;
  }

  if (nameiparent) {
    iput(cur);
    return 0;
  }
  return cur;
}
// Return the inode that path names, or 0 if the lookup fails.
// Thin wrapper around namex() with a scratch buffer for the element
// names it produces along the way.
struct inode* namei(char *path) {
  char elem[DIRSIZ];

  return namex(path, 0, elem);
}
// Return the inode of the parent directory of path and copy the final
// path element into name (room for DIRSIZ bytes required).
// Returns 0 if the lookup fails. Thin wrapper around namex().
struct inode* nameiparent(char *path, char *name) {
  struct inode *parent = namex(path, 1, name);

  return parent;
}