
The Linux Virtual File System (Kernel Initialization, Part 2)

Published: 2014-11-25 15:19:29  Source: linux website  Author: bullbat

This part adds some supplementary notes on the kernel-initialization portion of the Linux virtual file system.

About shrinkers: during initialization both the inode cache and the dentry cache register a shrinker of their own, which the VM uses to shrink those caches under memory pressure. The two registrations work in essentially the same way.

The shrinker data structure:
/*
 * A callback you can register to apply pressure to ageable caches.
 *
 * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'.  It should
 * look through the least-recently-used 'nr_to_scan' entries and
 * attempt to free them up.  It should return the number of objects
 * which remain in the cache.  If it returns -1, it means it cannot do
 * any scanning at this time (eg. there is a risk of deadlock).
 *
 * The 'gfpmask' refers to the allocation we are currently trying to
 * fulfil.
 *
 * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
 * querying the cache size, so a fastpath for that case is appropriate.
 */ 
struct shrinker {
    int (*shrink)(int nr_to_scan, gfp_t gfp_mask);
    int seeks;    /* seeks to recreate an obj */

    /* These are for internal use */
    struct list_head list;
    long nr;    /* objs pending delete */
};
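Before looking at the two registrations, it helps to see the callback contract in isolation: ->shrink() is asked to scan nr_to_scan entries (0 means "just report the cache size"), and it returns how many objects remain, or -1 when scanning is unsafe for the current allocation context. The following is a minimal userspace sketch of that contract, not kernel code; the names cache_shrink_cb, my_cache_shrink and MY_GFP_FS are made up for illustration.

#include <stdio.h>

/* Hypothetical callback type mirroring the old two-argument ->shrink() API. */
typedef int (*cache_shrink_cb)(int nr_to_scan, unsigned gfp_mask);

#define MY_GFP_FS 0x1    /* stand-in for __GFP_FS */

static int cached_objects = 1000;

/* nr_to_scan == 0 is only a size query; return -1 when scanning is unsafe. */
static int my_cache_shrink(int nr_to_scan, unsigned gfp_mask)
{
    if (nr_to_scan) {
        if (!(gfp_mask & MY_GFP_FS))
            return -1;                      /* cannot scan in this context */
        if (nr_to_scan > cached_objects)
            nr_to_scan = cached_objects;
        cached_objects -= nr_to_scan;       /* pretend to free entries */
    }
    return cached_objects;                  /* objects remaining in the cache */
}

int main(void)
{
    cache_shrink_cb shrink = my_cache_shrink;

    printf("size query:          %d\n", shrink(0, MY_GFP_FS));
    printf("after scanning 300:  %d\n", shrink(300, MY_GFP_FS));
    printf("GFP_NOFS-like call:  %d\n", shrink(300, 0));
    return 0;
}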
 

1. Registering the dentry cache shrinker

start_kernel()->vfs_caches_init()->dcache_init()->register_shrinker(&dcache_shrinker);

/*
 * Add a shrinker callback to be called from the vm
 */
void register_shrinker(struct shrinker *shrinker)
{
    shrinker->nr = 0;
    down_write(&shrinker_rwsem);
    list_add_tail(&shrinker->list, &shrinker_list);
    up_write(&shrinker_rwsem);
}

The dcache shrinker passed to register_shrinker(), and its callback, are defined here:

static struct shrinker dcache_shrinker = {
    .shrink = shrink_dcache_memory,
    .seeks = DEFAULT_SEEKS,
};

/*
 * Scan `nr' dentries and return the number which remain.
 *
 * We need to avoid reentering the filesystem if the caller is performing a
 * GFP_NOFS allocation attempt.  One example deadlock is:
 *
 * ext2_new_block->getblk->GFP->shrink_dcache_memory->prune_dcache->
 * prune_one_dentry->dput->dentry_iput->iput->inode->i_sb->s_op->put_inode->
 * ext2_discard_prealloc->ext2_free_blocks->lock_super->DEADLOCK.
 *
 * In this case we return -1 to tell the caller that we baled.
 */
static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
{
    if (nr) {
        if (!(gfp_mask & __GFP_FS))
            return -1;
        prune_dcache(nr);    /* shrink the cache by the requested amount */
    }
    return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
}
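The return value is the figure the VM uses to gauge how large the cache is: roughly one per cent of the unused dentries, scaled by the vfs_cache_pressure sysctl (default 100). A small sketch of the arithmetic with made-up numbers, comparing the default against a halved pressure setting:

#include <stdio.h>

int main(void)
{
    /* Hypothetical figures: 20000 unused dentries, default pressure 100. */
    long nr_unused = 20000;
    long vfs_cache_pressure = 100;

    /* Same integer arithmetic as the return statement above. */
    printf("reported to the VM: %ld\n",
           (nr_unused / 100) * vfs_cache_pressure);     /* 20000 */

    /* Lowering vfs_cache_pressure makes the cache look smaller to the VM,
     * so dentries and inodes are reclaimed less aggressively. */
    vfs_cache_pressure = 50;
    printf("with pressure 50:   %ld\n",
           (nr_unused / 100) * vfs_cache_pressure);     /* 10000 */
    return 0;
}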

/**
 * prune_dcache - shrink the dcache
 * @count: number of entries to try to free
 *
 * Shrink the dcache. This is done when we need more memory, or simply when we
 * need to unmount something (at which point we need to unuse all dentries).
 *
 * This function may fail to free any resources if all the dentries are in use.
 */
/* shrink the dcache; count is the number of entries to free */
static void prune_dcache(int count)
{
    struct super_block *sb;
    int w_count;
    int unused = dentry_stat.nr_unused;
    int prune_ratio;
    int pruned;

    if (unused == 0 || count == 0)
        return;
    spin_lock(&dcache_lock);
restart:
    if (count >= unused)
        prune_ratio = 1;    /* asked for more than we have: scan everything */
    else
        prune_ratio = unused / count;    /* the prune ratio */
    spin_lock(&sb_lock);
    list_for_each_entry(sb, &super_blocks, s_list) {
        if (sb->s_nr_dentry_unused == 0)
            continue;
        sb->s_count++;
        /* Now, we reclaim unused dentries with fairness.
         * We reclaim them same percentage from each superblock.
         * We calculate number of dentries to scan on this sb
         * as follows, but the implementation is arranged to avoid
         * overflows:
         * number of dentries to scan on this sb =
         * count * (number of dentries on this sb /
         * number of dentries in the machine)
         */
        spin_unlock(&sb_lock);
        /* turn the prune ratio into a per-superblock scan count */
        if (prune_ratio != 1)
            w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1;
        else
            w_count = sb->s_nr_dentry_unused;
        pruned = w_count;
        /*
         * We need to be sure this filesystem isn't being unmounted,
         * otherwise we could race with generic_shutdown_super(), and
         * end up holding a reference to an inode while the filesystem
         * is unmounted.  So we try to get s_umount, and make sure
         * s_root isn't NULL.
         */
        if (down_read_trylock(&sb->s_umount)) {
            if ((sb->s_root != NULL) &&
                (!list_empty(&sb->s_dentry_lru))) {
                spin_unlock(&dcache_lock);
                /* the actual pruning happens here */
                __shrink_dcache_sb(sb, &w_count,
                        DCACHE_REFERENCED);
                pruned -= w_count;
                spin_lock(&dcache_lock);
            }
            up_read(&sb->s_umount);
        }
        spin_lock(&sb_lock);
        count -= pruned;
        /*
         * restart only when sb is no longer on the list and
         * we have more work to do.
         */
        if (__put_super_and_need_restart(sb) && count > 0) {
            spin_unlock(&sb_lock);
            goto restart;
        }
    }
    spin_unlock(&sb_lock);
    spin_unlock(&dcache_lock);
}
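The fairness logic boils down to computing prune_ratio = unused / count once, then asking each superblock for roughly s_nr_dentry_unused / prune_ratio + 1 of its own entries, so every filesystem gives up about the same percentage of its unused dentries. A userspace sketch of that arithmetic over hypothetical per-superblock counts:

#include <stdio.h>

int main(void)
{
    /* Hypothetical unused-dentry counts for three mounted filesystems. */
    int sb_unused[] = { 9000, 600, 400 };
    int unused = 10000;    /* dentry_stat.nr_unused */
    int count = 1000;      /* how many entries we were asked to free */

    int prune_ratio = (count >= unused) ? 1 : unused / count;

    for (int i = 0; i < 3; i++) {
        int w_count;

        if (prune_ratio != 1)
            w_count = sb_unused[i] / prune_ratio + 1;
        else
            w_count = sb_unused[i];
        printf("sb%d: scan %d of %d unused (~%d%%)\n",
               i, w_count, sb_unused[i],
               sb_unused[i] ? 100 * w_count / sb_unused[i] : 0);
    }
    return 0;
}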

/*
 * Shrink the dentry LRU on a given superblock.
 * @sb   : superblock to shrink dentry LRU.
 * @count: If count is NULL, we prune all dentries on superblock.
 * @flags: If flags is non-zero, we need to do special processing based on
 * which flags are set. This means we don't need to maintain multiple
 * similar copies of this loop.
 */
static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
{
    LIST_HEAD(referenced);
    LIST_HEAD(tmp);
    struct dentry *dentry;
    int cnt = 0;

    BUG_ON(!sb);
    BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
    spin_lock(&dcache_lock);
    if (count != NULL)
        /* called from prune_dcache() and shrink_dcache_parent() */
        cnt = *count;    /* number of entries to scan below */
restart:
    if (count == NULL)
        list_splice_init(&sb->s_dentry_lru, &tmp);
    else {
        while (!list_empty(&sb->s_dentry_lru)) {
            dentry = list_entry(sb->s_dentry_lru.prev,
                    struct dentry, d_lru);
            BUG_ON(dentry->d_sb != sb);

            spin_lock(&dentry->d_lock);
            /*
             * If we are honouring the DCACHE_REFERENCED flag and
             * the dentry has this flag set, don't free it. Clear
             * the flag and put it back on the LRU.
             */
            /* clear the flag and park the entry on the referenced list,
             * which is spliced back onto the LRU at the end */
            if ((flags & DCACHE_REFERENCED)
                && (dentry->d_flags & DCACHE_REFERENCED)) {
                dentry->d_flags &= ~DCACHE_REFERENCED;
                list_move(&dentry->d_lru, &referenced);
                spin_unlock(&dentry->d_lock);
            } else {
                /* move from the d_lru list onto the tmp list */
                list_move_tail(&dentry->d_lru, &tmp);
                spin_unlock(&dentry->d_lock);
                cnt--;    /* one fewer entry to scan */
                if (!cnt)    /* scanned enough, stop */
                    break;
            }
            cond_resched_lock(&dcache_lock);
        }
    }
    /* now walk the entries that were moved onto tmp above */
    while (!list_empty(&tmp)) {
        dentry = list_entry(tmp.prev, struct dentry, d_lru);
        /* unlink from tmp, reinitialize d_lru and update the statistics */
        dentry_lru_del_init(dentry);
        spin_lock(&dentry->d_lock);
        /*
         * We found an inuse dentry which was not removed from
         * the LRU because of laziness during lookup.  Do not free
         * it - just keep it off the LRU list.
         */
        if (atomic_read(&dentry->d_count)) {
            spin_unlock(&dentry->d_lock);
            continue;
        }
        /* free the dentry and, where possible, its ancestors */
        prune_one_dentry(dentry);
        /* dentry->d_lock was dropped in prune_one_dentry() */
        cond_resched_lock(&dcache_lock);
    }
    if (count == NULL && !list_empty(&sb->s_dentry_lru))
        goto restart;
    if (count != NULL)
        *count = cnt;
    if (!list_empty(&referenced))
        list_splice(&referenced, &sb->s_dentry_lru);
    spin_unlock(&dcache_lock);
}
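Structurally, __shrink_dcache_sb() is a second-chance LRU scan: the first pass walks the cold end of the LRU, giving recently referenced dentries another lap and collecting the rest on a private tmp list; the second pass frees whatever on tmp is still unreferenced. The sketch below reproduces the same two-pass idea in userspace; the entry type and the push/pop helpers are hypothetical, not the kernel list API.

#include <stdio.h>
#include <stdlib.h>

struct entry {
    int id;
    int referenced;        /* stand-in for DCACHE_REFERENCED */
    int refcount;          /* stand-in for d_count */
    struct entry *next;
};

static struct entry *pop(struct entry **list)
{
    struct entry *e = *list;
    if (e)
        *list = e->next;
    return e;
}

static void push(struct entry **list, struct entry *e)
{
    e->next = *list;
    *list = e;
}

static void shrink(struct entry **lru, int nr_to_scan)
{
    struct entry *tmp = NULL, *referenced = NULL, *e;

    /* Pass 1: give referenced entries a second chance, stash the rest. */
    while (nr_to_scan && (e = pop(lru))) {
        if (e->referenced) {
            e->referenced = 0;
            push(&referenced, e);
        } else {
            push(&tmp, e);
            nr_to_scan--;
        }
    }
    /* Pass 2: free unpinned entries collected on tmp. */
    while ((e = pop(&tmp))) {
        if (e->refcount) {      /* in use: just keep it off the LRU */
            printf("keeping busy entry %d off the LRU\n", e->id);
            continue;
        }
        printf("freeing entry %d\n", e->id);
        free(e);
    }
    /* Put the second-chance entries back onto the LRU. */
    while ((e = pop(&referenced)))
        push(lru, e);
}

int main(void)
{
    struct entry *lru = NULL;

    for (int i = 0; i < 5; i++) {
        struct entry *e = calloc(1, sizeof(*e));
        e->id = i;
        e->referenced = (i == 2);   /* entry 2 was recently used */
        push(&lru, e);
    }
    shrink(&lru, 4);
    return 0;
}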

static void dentry_lru_del_init(struct dentry *dentry)
{
    if (likely(!list_empty(&dentry->d_lru))) {
        list_del_init(&dentry->d_lru);    /* unlink and reinitialize d_lru */
        dentry->d_sb->s_nr_dentry_unused--;    /* one fewer unused dentry on this sb */
        dentry_stat.nr_unused--;    /* update the global statistics */
    }
}

/*
 * Throw away a dentry - free the inode, dput the parent.  This requires that
 * the LRU list has already been removed.
 *
 * Try to prune ancestors as well.  This is necessary to prevent
 * quadratic behavior of shrink_dcache_parent(), but is also expected
 * to be beneficial in reducing dentry cache fragmentation.
 */
static void prune_one_dentry(struct dentry * dentry)
    __releases(dentry->d_lock)
    __releases(dcache_lock)
    __acquires(dcache_lock)
{
    __d_drop(dentry);
    dentry = d_kill(dentry);    /* free the dentry, get back its parent */

    /*
     * Prune ancestors.  Locking is simpler than in dput(),
     * because dcache_lock needs to be taken anyway.
     */
    spin_lock(&dcache_lock);
    while (dentry) {
        if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock))
            return;

        if (dentry->d_op && dentry->d_op->d_delete)
            dentry->d_op->d_delete(dentry);
        dentry_lru_del_init(dentry);
        __d_drop(dentry);
        dentry = d_kill(dentry);
        spin_lock(&dcache_lock);
    }
}
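Killing a dentry drops the last reference its parent held for it, so prune_one_dentry() keeps walking up: if the parent's reference count also falls to zero it is killed too, and so on. This upward walk is what keeps shrink_dcache_parent() linear rather than quadratic. A userspace sketch of the same walk-up-and-release pattern, with a hypothetical node type:

#include <stdio.h>
#include <stdlib.h>

struct node {
    const char *name;
    int refcount;           /* counts children plus external users */
    struct node *parent;
};

/* Free a node and return its parent, mirroring what d_kill() does. */
static struct node *kill(struct node *n)
{
    struct node *parent = n->parent;
    printf("freeing %s\n", n->name);
    free(n);
    return parent;
}

/* Free one node, then keep freeing ancestors whose last reference was us. */
static void prune_one(struct node *n)
{
    n = kill(n);
    while (n) {
        if (--n->refcount > 0)      /* parent still referenced elsewhere */
            return;
        n = kill(n);
    }
}

int main(void)
{
    struct node *a = calloc(1, sizeof(*a));
    struct node *b = calloc(1, sizeof(*b));
    struct node *c = calloc(1, sizeof(*c));

    a->name = "a"; a->refcount = 1;                  /* only child b holds a */
    b->name = "b"; b->refcount = 1; b->parent = a;   /* only child c holds b */
    c->name = "c"; c->parent = b;
    prune_one(c);   /* frees c, then b, then a */
    return 0;
}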

/**
 * d_kill - kill dentry and return parent
 * @dentry: dentry to kill
 *
 * The dentry must already be unhashed and removed from the LRU.
 *
 * If this is the root of the dentry tree, return NULL.
 */
static struct dentry *d_kill(struct dentry *dentry)
    __releases(dentry->d_lock)
    __releases(dcache_lock)
{
    struct dentry *parent;

    list_del(&dentry->d_u.d_child);    /* unlink from the parent's list of children */
    dentry_stat.nr_dentry--;    /* update statistics; for d_free, below */
    /* drops the locks, at that point nobody can reach this dentry */
    dentry_iput(dentry);    /* release the attached inode */
    if (IS_ROOT(dentry))
        parent = NULL;
    else
        parent = dentry->d_parent;
    d_free(dentry);    /* free the dentry itself */
    return parent;
}

/*
 * Release the dentry's inode, using the filesystem
 * d_iput() operation if defined.
 */
/* release the inode attached to a dentry */
static void dentry_iput(struct dentry * dentry)
    __releases(dentry->d_lock)
    __releases(dcache_lock)
{
    struct inode *inode = dentry->d_inode;
    if (inode) {
        dentry->d_inode = NULL;
        list_del_init(&dentry->d_alias);    /* remove from the inode's list of alias dentries */
        spin_unlock(&dentry->d_lock);
        spin_unlock(&dcache_lock);
        if (!inode->i_nlink)    /* the inode has no remaining hard links */
            fsnotify_inoderemove(inode);
        if (dentry->d_op && dentry->d_op->d_iput)
            dentry->d_op->d_iput(dentry, inode);
        else
            iput(inode);    /* drop the inode reference */
    } else {
        spin_unlock(&dentry->d_lock);
        spin_unlock(&dcache_lock);
    }
}
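dentry_iput() illustrates a pattern used throughout the VFS: if the filesystem supplies its own hook (here d_op->d_iput) it gets the final word, otherwise the generic path (iput()) runs. A tiny userspace sketch of that hook-with-fallback pattern; the types and function names are invented for illustration:

#include <stdio.h>

struct myinode { int ino; };

/* Optional per-filesystem hook, analogous to dentry->d_op->d_iput. */
struct dentry_ops {
    void (*d_iput)(struct myinode *inode);
};

static void default_iput(struct myinode *inode)
{
    printf("default iput on inode %d\n", inode->ino);
}

static void custom_iput(struct myinode *inode)
{
    printf("filesystem-specific release of inode %d\n", inode->ino);
}

/* Release the inode, using the filesystem hook if one is defined. */
static void release_inode(const struct dentry_ops *ops, struct myinode *inode)
{
    if (ops && ops->d_iput)
        ops->d_iput(inode);
    else
        default_iput(inode);
}

int main(void)
{
    struct myinode ino = { 42 };
    struct dentry_ops fs_specific = { .d_iput = custom_iput };

    release_inode(NULL, &ino);          /* no hook: generic path */
    release_inode(&fs_specific, &ino);  /* hook provided: filesystem path */
    return 0;
}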


2. Registering the inode cache shrinker

start_kernel()->vfs_caches_init()->inode_init()->register_shrinker(&icache_shrinker);

The shrinker passed to register_shrinker() is defined as follows:

static struct shrinker icache_shrinker = {
    .shrink = shrink_icache_memory,
    .seeks = DEFAULT_SEEKS,
};

static int shrink_icache_memory(int nr, gfp_t gfp_mask)
{
    if (nr) {
        /*
         * Nasty deadlock avoidance.  We may hold various FS locks,
         * and we don't want to recurse into the FS that called us
         * in clear_inode() and friends..
         */
        if (!(gfp_mask & __GFP_FS))
            return -1;
        prune_icache(nr);
    }
    return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
}

/*
 * Scan `goal' inodes on the unused list for freeable ones. They are moved to
 * a temporary list and then are freed outside inode_lock by dispose_list().
 *
 * Any inodes which are pinned purely because of attached pagecache have their
 * pagecache removed.  We expect the final iput() on that inode to add it to
 * the front of the inode_unused list.  So look for it there and if the
 * inode is still freeable, proceed.  The right inode is found 99.9% of the
 * time in testing on a 4-way.
 *
 * If the inode has metadata buffers attached to mapping->private_list then
 * try to remove them.
 */
static void prune_icache(int nr_to_scan)
{
    LIST_HEAD(freeable);    /* temporary list of inodes that can be freed, used below */
    int nr_pruned = 0;
    int nr_scanned;
    unsigned long reap = 0;

    down_read(&iprune_sem);
    spin_lock(&inode_lock);
    for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
        struct inode *inode;

        if (list_empty(&inode_unused))
            break;

        inode = list_entry(inode_unused.prev, struct inode, i_list);

        if (inode->i_state || atomic_read(&inode->i_count)) {
            /* still in use: rotate it to the head of inode_unused */
            list_move(&inode->i_list, &inode_unused);
            continue;
        }
        if (inode_has_buffers(inode) || inode->i_data.nrpages) {
            __iget(inode);    /* take a reference (moves it to the in-use list) */
            spin_unlock(&inode_lock);
            if (remove_inode_buffers(inode))    /* drop all buffers from the buffer list */
                reap += invalidate_mapping_pages(&inode->i_data,
                        0, -1);
            iput(inode);
            spin_lock(&inode_lock);

            if (inode != list_entry(inode_unused.next,
                        struct inode, i_list))
                continue;    /* wrong inode or list_empty */
            if (!can_unuse(inode))
                continue;
        }
        /* move the inode onto the freeable list */
        list_move(&inode->i_list, &freeable);
        WARN_ON(inode->i_state & I_NEW);
        inode->i_state |= I_FREEING;
        nr_pruned++;    /* count entries moved onto freeable */
    }
    inodes_stat.nr_unused -= nr_pruned;    /* update the statistics */
    if (current_is_kswapd())
        __count_vm_events(KSWAPD_INODESTEAL, reap);
    else
        __count_vm_events(PGINODESTEAL, reap);
    spin_unlock(&inode_lock);

    dispose_list(&freeable);    /* actually free everything on the freeable list */
    up_read(&iprune_sem);
}
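Two things are worth noting in prune_icache(): busy inodes are merely rotated back onto inode_unused, and the freeable ones are first collected on a private list and only destroyed by dispose_list() after inode_lock has been dropped, so the expensive teardown never happens under the spinlock. A userspace sketch of that collect-under-lock, free-outside-lock pattern (the obj type and prune() are hypothetical; build with -lpthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
    int id;
    int busy;                  /* stand-in for the i_state / i_count checks */
    struct obj *next;
};

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *unused_list;    /* protected by cache_lock */

/* Scan up to nr objects: busy ones stay cached, freeable ones are moved to a
 * private list and destroyed only after the lock is dropped. */
static void prune(int nr)
{
    struct obj *freeable = NULL, *keep = NULL;

    pthread_mutex_lock(&cache_lock);
    while (nr-- && unused_list) {
        struct obj *o = unused_list;

        unused_list = o->next;
        if (o->busy) {               /* still in use: keep it cached */
            o->next = keep;
            keep = o;
            continue;
        }
        o->next = freeable;          /* freeable: collect privately */
        freeable = o;
    }
    while (keep) {                   /* put busy ones back before unlocking */
        struct obj *next = keep->next;
        keep->next = unused_list;
        unused_list = keep;
        keep = next;
    }
    pthread_mutex_unlock(&cache_lock);

    while (freeable) {               /* expensive teardown outside the lock */
        struct obj *next = freeable->next;
        printf("disposing of object %d\n", freeable->id);
        free(freeable);
        freeable = next;
    }
}

int main(void)
{
    for (int i = 0; i < 5; i++) {
        struct obj *o = calloc(1, sizeof(*o));
        o->id = i;
        o->busy = (i == 3);          /* object 3 is still pinned */
        o->next = unused_list;
        unused_list = o;
    }
    prune(5);
    return 0;
}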


3. Registering the fdtable deferred-free work function

/* the file descriptor table */
struct fdtable {
    unsigned int max_fds;    /* maximum number of file structures the process can handle */
    struct file ** fd;    /* current fd array: one entry per open file */
    fd_set *close_on_exec;    /* set of fds to be closed across exec() */
    fd_set *open_fds;    /* set of currently open fds */
    struct rcu_head rcu;
    struct fdtable *next;
};
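In other words, the table is an array of struct file pointers indexed by fd number, plus two bitmaps recording, per fd, whether it is open and whether it must be closed across exec(). A userspace sketch of that layout with invented stub types; it also shows the usual lowest-free-fd allocation rule:

#include <stdio.h>

#define MAX_FDS 32

struct file_stub { const char *path; };

/* Simplified userspace analogue of struct fdtable. */
struct fdtable_stub {
    unsigned int max_fds;
    struct file_stub *fd[MAX_FDS];    /* fd number -> open file */
    unsigned long close_on_exec;      /* one bit per fd */
    unsigned long open_fds;           /* one bit per fd */
};

/* Install a file at the lowest free fd number. */
static int install_fd(struct fdtable_stub *t, struct file_stub *f, int cloexec)
{
    for (unsigned int i = 0; i < t->max_fds; i++) {
        if (!(t->open_fds & (1UL << i))) {
            t->fd[i] = f;
            t->open_fds |= 1UL << i;
            if (cloexec)
                t->close_on_exec |= 1UL << i;
            return (int)i;
        }
    }
    return -1;    /* table full */
}

int main(void)
{
    struct fdtable_stub t = { .max_fds = MAX_FDS };
    struct file_stub log = { "/var/log/app.log" };
    struct file_stub sock = { "socket:[1234]" };

    int fd1 = install_fd(&t, &log, 0);
    int fd2 = install_fd(&t, &sock, 1);    /* marked close-on-exec */

    printf("fd %d -> %s\n", fd1, t.fd[fd1]->path);
    printf("fd %d -> %s (close-on-exec: %lu)\n",
           fd2, t.fd[fd2]->path, (t.close_on_exec >> fd2) & 1);
    return 0;
}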

start_kernel()->vfs_caches_init()->files_init()->files_defer_init()->fdtable_defer_list_init()->INIT_WORK(&fddef->wq, free_fdtable_work);

static void free_fdtable_work(struct work_struct *work)
{
    struct fdtable_defer *f =
        container_of(work, struct fdtable_defer, wq);
    struct fdtable *fdt;

    spin_lock_bh(&f->lock);
    fdt = f->next;
    f->next = NULL;
    spin_unlock_bh(&f->lock);
    while (fdt) {    /* free each deferred fdtable in the chain */
        struct fdtable *next = fdt->next;
        vfree(fdt->fd);
        free_fdset(fdt);
        kfree(fdt);
        fdt = next;
    }
}
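The handler only receives a pointer to the embedded work_struct and recovers its fdtable_defer with container_of(), which is plain pointer arithmetic; it then detaches the whole chain under the lock and frees it at leisure. A userspace sketch of the container_of() trick with invented types:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct work { int pending; };

/* Hypothetical analogue of struct fdtable_defer: the work item is embedded. */
struct defer_ctx {
    int lock;               /* placeholder for the spinlock */
    struct work wq;
    const char *name;
};

/* The callback only receives the embedded member... */
static void handler(struct work *w)
{
    /* ...and recovers the containing structure from it. */
    struct defer_ctx *ctx = container_of(w, struct defer_ctx, wq);
    printf("running deferred work for %s\n", ctx->name);
}

int main(void)
{
    struct defer_ctx ctx = { .name = "fdtable-defer" };
    handler(&ctx.wq);
    return 0;
}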
 

4. Initializing the sysfs filesystem

start_kernel()->vfs_caches_init()->mnt_init()->sysfs_init()

int __init sysfs_init(void)
{
    int err = -ENOMEM;

    sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache",
                sizeof(struct sysfs_dirent),
                0, 0, NULL);
    if (!sysfs_dir_cachep)
        goto out;
    /* initialize sysfs's backing_dev_info */
    err = sysfs_inode_init();
    if (err)
        goto out_err;
    /* register the filesystem type */
    err = register_filesystem(&sysfs_fs_type);
    if (!err) {
        /* create the internal sysfs mount */
        sysfs_mount = kern_mount(&sysfs_fs_type);
        if (IS_ERR(sysfs_mount)) {
            printk(KERN_ERR "sysfs: could not mount!\n");
            err = PTR_ERR(sysfs_mount);
            sysfs_mount = NULL;
            unregister_filesystem(&sysfs_fs_type);
            goto out_err;
        }
    } else
        goto out_err;
out:
    return err;
out_err:
    kmem_cache_destroy(sysfs_dir_cachep);
    sysfs_dir_cachep = NULL;
    goto out;
}
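sysfs_init() follows the kernel's usual goto-based error unwinding: every setup step either succeeds or jumps to a label that undoes exactly what has been done so far. Below is a userspace sketch of the same pattern with invented step names; it is a sketch of the style, not of sysfs itself (the mount step is made to fail so the unwinding actually runs):

#include <stdio.h>
#include <stdlib.h>

static void *create_cache(void)     { return malloc(16); }
static int   register_fs(void)      { return 0; }       /* 0 on success */
static void *mount_fs(void)         { return NULL; }    /* simulate a mount failure */
static void  unregister_fs(void)    { puts("unregistering filesystem"); }
static void  destroy_cache(void *c) { puts("destroying cache"); free(c); }

static int init(void)
{
    int err = -1;
    void *cache, *mnt;

    cache = create_cache();
    if (!cache)
        goto out;                 /* nothing to undo yet */

    err = register_fs();
    if (err)
        goto out_cache;

    mnt = mount_fs();
    if (!mnt) {
        err = -1;
        goto out_unregister;      /* undo the registration, then the cache */
    }
    return 0;

out_unregister:
    unregister_fs();
out_cache:
    destroy_cache(cache);
out:
    return err;
}

int main(void)
{
    printf("init() -> %d\n", init());
    return 0;
}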