这部分主要对Linux虚拟文件系统(VFS)在内核初始化阶段所做的工作做些补充。
关于shrinker:inode cache和dentry cache在初始化阶段都需要注册各自的shrinker,用于在需要回收内存时缩减cache。两者的工作原理类似。
shrinker数据结构介绍
/*
 * A callback you can register to apply pressure to ageable caches.
 *
 * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'. It should
 * look through the least-recently-used 'nr_to_scan' entries and
 * attempt to free them up. It should return the number of objects
 * which remain in the cache. If it returns -1, it means it cannot do
 * any scanning at this time (eg. there is a risk of deadlock).
 *
 * The 'gfpmask' refers to the allocation we are currently trying to
 * fulfil.
 *
 * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
 * querying the cache size, so a fastpath for that case is appropriate.
 */
struct shrinker {
/* cache-specific callback; see the contract described above */
int (*shrink)(int nr_to_scan, gfp_t gfp_mask);
int seeks; /* seeks to recreate an obj */
/* These are for internal use */
/* list node; register_shrinker() links it onto shrinker_list */
struct list_head list;
/* objs pending delete; register_shrinker() resets it to 0 */
long nr;
};
1. 注册dentry cache shrinker
start_kernel()->vfs_caches_init()->dcache_init()->register_shrinker(&dcache_shrinker);
/*
 * register_shrinker - add a shrinker callback to be called from the vm
 * @shrinker: the shrinker to register
 *
 * Resets the shrinker's 'nr' counter to zero and appends the shrinker to
 * the tail of shrinker_list. The list update is done with shrinker_rwsem
 * held for writing.
 */
void register_shrinker(struct shrinker *shrinker)
{
shrinker->nr = 0;
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem);
}
其中dcache_shrinker及其回调函数shrink_dcache_memory()的定义如下:
/*
 * Shrinker descriptor for the dentry cache: the callback is
 * shrink_dcache_memory() and the seek cost is DEFAULT_SEEKS.
 */
static struct shrinker dcache_shrinker = {
.shrink = shrink_dcache_memory,
.seeks = DEFAULT_SEEKS,
};
/*
 * shrink_dcache_memory - shrinker callback for the dentry cache.
 * @nr:       number of dentries to scan; 0 means "only report the size".
 * @gfp_mask: allocation context of the caller.
 *
 * When a scan is requested, prune up to @nr dentries via prune_dcache().
 * Re-entering the filesystem must be avoided when the caller is doing a
 * GFP_NOFS allocation, otherwise a deadlock such as the following can occur:
 *
 *   ext2_new_block->getblk->GFP->shrink_dcache_memory->prune_dcache->
 *   prune_one_dentry->dput->dentry_iput->iput->inode->i_sb->s_op->put_inode->
 *   ext2_discard_prealloc->ext2_free_blocks->lock_super->DEADLOCK.
 *
 * In that case nothing is scanned and -1 is returned. Otherwise the return
 * value is the number of unused dentries scaled by
 * sysctl_vfs_cache_pressure / 100.
 */
static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
{
	const int scan_requested = (nr != 0);

	/* A scan would touch the filesystem; refuse if the caller forbids it. */
	if (scan_requested && (gfp_mask & __GFP_FS) == 0)
		return -1;

	if (scan_requested)
		prune_dcache(nr);

	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
}
/**
 * prune_dcache - shrink the dcache
 * @count: number of entries to try to free
 *
 * Shrink the dcache. This is done when we need more memory, or simply when we
 * need to unmount something (at which point we need to unuse all dentries).
 *
 * This function may fail to free any resources if all the dentries are in use.
 *
 * The request is spread over every superblock on the super_blocks list in
 * proportion to its number of unused dentries; the freeing itself is done by
 * __shrink_dcache_sb().
 */
static void prune_dcache(int count)
{
struct super_block *sb;
/* scan quota for the superblock currently being processed */
int w_count;
int unused = dentry_stat.nr_unused;
/* unused / count; the value 1 means "scan every unused dentry" */
int prune_ratio;
/* amount charged against count for the current superblock */
int pruned;
/* nothing is unused, or nothing was requested */
if (unused == 0 || count == 0)
return;
spin_lock(&dcache_lock);
restart:
/* asked for at least as many as exist: ratio 1, i.e. take everything */
if (count >= unused)
prune_ratio = 1;
else
prune_ratio = unused / count;
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
if (sb->s_nr_dentry_unused == 0)
continue;
/*
 * Bump the superblock's count before sb_lock is dropped; this is
 * paired with __put_super_and_need_restart() further down.
 */
sb->s_count++;
/* Now, we reclaim unused dentries with fairness.
 * We reclaim them same percentage from each superblock.
 * We calculate number of dentries to scan on this sb
 * as follows, but the implementation is arranged to avoid
 * overflows:
 * number of dentries to scan on this sb =
 * count * (number of dentries on this sb /
 * number of dentries in the machine)
 */
spin_unlock(&sb_lock);
/* turn the global ratio into this superblock's scan quota */
if (prune_ratio != 1)
w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1;
else
w_count = sb->s_nr_dentry_unused;
/*
 * Start by charging the full quota; if the shrink below runs, the
 * part of the quota it leaves in w_count is given back.
 */
pruned = w_count;
/*
 * We need to be sure this filesystem isn't being unmounted,
 * otherwise we could race with generic_shutdown_super(), and
 * end up holding a reference to an inode while the filesystem
 * is unmounted. So we try to get s_umount, and make sure
 * s_root isn't NULL.
 */
if (down_read_trylock(&sb->s_umount)) {
if ((sb->s_root != NULL) &&
(!list_empty(&sb->s_dentry_lru))) {
/* __shrink_dcache_sb() takes dcache_lock itself */
spin_unlock(&dcache_lock);
/* the actual freeing; w_count is updated to what is left of the quota */
__shrink_dcache_sb(sb, &w_count,
DCACHE_REFERENCED);
pruned -= w_count;
spin_lock(&dcache_lock);
}
up_read(&sb->s_umount);
}
spin_lock(&sb_lock);
count -= pruned;
/*
 * restart only when sb is no longer on the list and
 * we have more work to do.
 */
if (__put_super_and_need_restart(sb) && count > 0) {
spin_unlock(&sb_lock);
goto restart;
}
}
spin_unlock(&sb_lock);
spin_unlock(&dcache_lock);
}
/*
 * Shrink the dentry LRU on a given superblock.
 * @sb : superblock to shrink dentry LRU.
 * @count: If count is NULL, we prune all dentries on superblock.
 *         Otherwise *count is the number of dentries to take off the LRU,
 *         and on return it holds whatever part of that number was not used.
 * @flags: If flags is non-zero, we need to do special processing based on
 * which flags are set. This means we don't need to maintain multiple
 * similar copies of this loop.
 *
 * Takes dcache_lock on entry and releases it before returning.
 */
static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
{
/* dentries skipped because DCACHE_REFERENCED was set; spliced back at the end */
LIST_HEAD(referenced);
/* dentries selected for freeing */
LIST_HEAD(tmp);
struct dentry *dentry;
int cnt = 0;
BUG_ON(!sb);
/* honouring DCACHE_REFERENCED requires a bounded scan */
BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
spin_lock(&dcache_lock);
if (count != NULL)
/* called from prune_dcache() and shrink_dcache_parent() */
/* remaining budget, decremented in the selection loop below */
cnt = *count;
restart:
/* unbounded request: grab the whole LRU in one go */
if (count == NULL)
list_splice_init(&sb->s_dentry_lru, &tmp);
else {
while (!list_empty(&sb->s_dentry_lru)) {
/* take the entry at the .prev end of the LRU list */
dentry = list_entry(sb->s_dentry_lru.prev,
struct dentry, d_lru);
BUG_ON(dentry->d_sb != sb);
spin_lock(&dentry->d_lock);
/*
 * If we are honouring the DCACHE_REFERENCED flag and
 * the dentry has this flag set, don't free it. Clear
 * the flag and put it back on the LRU.
 */
/*
 * "Back on the LRU" happens in two steps: the dentry is parked on
 * the local 'referenced' list here, and that list is spliced onto
 * sb->s_dentry_lru just before this function returns.
 */
if ((flags & DCACHE_REFERENCED)
&& (dentry->d_flags & DCACHE_REFERENCED)) {
dentry->d_flags &= ~DCACHE_REFERENCED;
list_move(&dentry->d_lru, &referenced);
spin_unlock(&dentry->d_lock);
} else {
/* move it from the LRU onto the tmp list for freeing */
list_move_tail(&dentry->d_lru, &tmp);
spin_unlock(&dentry->d_lock);
/* one unit of budget used; stop selecting once it reaches zero */
cnt--;
if (!cnt)
break;
}
cond_resched_lock(&dcache_lock);
}
}
/* process every dentry that was moved onto tmp above */
while (!list_empty(&tmp)) {
dentry = list_entry(tmp.prev, struct dentry, d_lru);
/* unlink from tmp, re-initialise d_lru and update the unused counters */
dentry_lru_del_init(dentry);
spin_lock(&dentry->d_lock);
/*
 * We found an inuse dentry which was not removed from
 * the LRU because of laziness during lookup. Do not free
 * it - just keep it off the LRU list.
 */
if (atomic_read(&dentry->d_count)) {
spin_unlock(&dentry->d_lock);
continue;
}
/* free the dentry; prune_one_dentry() also tries to prune its ancestors */
prune_one_dentry(dentry);
/* dentry->d_lock was dropped in prune_one_dentry() */
cond_resched_lock(&dcache_lock);
}
/* unbounded request: repeat while the LRU has entries again */
if (count == NULL && !list_empty(&sb->s_dentry_lru))
goto restart;
/* report the unused part of the budget back to the caller */
if (count != NULL)
*count = cnt;
if (!list_empty(&referenced))
list_splice(&referenced, &sb->s_dentry_lru);
spin_unlock(&dcache_lock);
}
static void dentry_lru_del_init(struct dentry *dentry)
{
if (likely(!list_empty(&dentry->d_lru))) {
list_del_init(&dentry->d_lru);/*从链表中删除并初始化dentry->d_lru*/
dentry->d_sb->s_nr_dentry_unused--;/*未用数减一*/
dentry_stat.nr_unused--;/*更新统计数据*/
}
}
prune_one_dentry()的实现如下:
/*
 * Throw away a dentry - free the inode, dput the parent. This requires that
 * the LRU list has already been removed.
 *
 * Try to prune ancestors as well. This is necessary to prevent
 * quadratic behavior of shrink_dcache_parent(), but is also expected
 * to be beneficial in reducing dentry cache fragmentation.
 *
 * Per the annotations below, dentry->d_lock is released and dcache_lock is
 * released and then re-acquired. Note that the early return inside the loop
 * is taken with dcache_lock held (it was re-taken just before).
 */
static void prune_one_dentry(struct dentry * dentry)
__releases(dentry->d_lock)
__releases(dcache_lock)
__acquires(dcache_lock)
{
__d_drop(dentry);
/* kill this dentry; d_kill() returns its parent, or NULL for a root */
dentry = d_kill(dentry);
/*
 * Prune ancestors. Locking is simpler than in dput(),
 * because dcache_lock needs to be taken anyway.
 */
spin_lock(&dcache_lock);
while (dentry) {
/* stop climbing as soon as an ancestor's d_count does not drop to zero */
if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock))
return;
/* give the filesystem's d_delete hook, if any, a chance to run */
if (dentry->d_op && dentry->d_op->d_delete)
dentry->d_op->d_delete(dentry);
dentry_lru_del_init(dentry);
__d_drop(dentry);
/* move one level up; d_kill() released dcache_lock, so re-take it */
dentry = d_kill(dentry);
spin_lock(&dcache_lock);
}
}
/**
 * d_kill - kill dentry and return parent
 * @dentry: dentry to kill
 *
 * The dentry must already be unhashed and removed from the LRU.
 *
 * Unlinks the dentry from its d_child list, decrements
 * dentry_stat.nr_dentry, releases its inode through dentry_iput() (which
 * also drops dentry->d_lock and dcache_lock) and finally frees the dentry.
 *
 * If this is the root of the dentry tree, return NULL; otherwise return the
 * parent dentry.
 */
static struct dentry *d_kill(struct dentry *dentry)
	__releases(dentry->d_lock)
	__releases(dcache_lock)
{
	struct dentry *ancestor;

	/* Detach from the sibling list kept through d_u.d_child. */
	list_del(&dentry->d_u.d_child);
	/* Accounting for the d_free() below. */
	dentry_stat.nr_dentry--;
	/* Drops the locks; at that point nobody can reach this dentry. */
	dentry_iput(dentry);

	/* Read the parent before the dentry's memory is released. */
	ancestor = IS_ROOT(dentry) ? NULL : dentry->d_parent;
	d_free(dentry);

	return ancestor;
}
/*
 * dentry_iput - release the dentry's inode, using the filesystem
 * d_iput() operation if defined.
 * @dentry: dentry whose inode reference is being given up.
 *
 * Both dentry->d_lock and dcache_lock are released on every path. When the
 * dentry has an inode, the dentry is first detached from it (d_inode
 * cleared, d_alias unlinked) under the locks; after the locks are dropped,
 * fsnotify_inoderemove() is called if the inode has no links left, and the
 * inode is handed to d_op->d_iput() or, failing that, iput().
 */
static void dentry_iput(struct dentry * dentry)
	__releases(dentry->d_lock)
	__releases(dcache_lock)
{
	struct inode *victim = dentry->d_inode;

	/* No inode attached: only the locks need to be dropped. */
	if (!victim) {
		spin_unlock(&dentry->d_lock);
		spin_unlock(&dcache_lock);
		return;
	}

	dentry->d_inode = NULL;
	/* Remove this dentry from the inode's alias list. */
	list_del_init(&dentry->d_alias);
	spin_unlock(&dentry->d_lock);
	spin_unlock(&dcache_lock);

	/* The inode has no hard links left. */
	if (victim->i_nlink == 0)
		fsnotify_inoderemove(victim);

	if (dentry->d_op && dentry->d_op->d_iput) {
		dentry->d_op->d_iput(dentry, victim);
		return;
	}
	iput(victim);
}
2.注册inode cache shrinker
start_kernel()->vfs_caches_init()->inode_init()->register_shrinker(&icache_shrinker);
其中参数icache_shrinker的定义如下:
/*
 * Shrinker descriptor for the inode cache: the callback is
 * shrink_icache_memory() and the seek cost is DEFAULT_SEEKS.
 */
static struct shrinker icache_shrinker = {
.shrink = shrink_icache_memory,
.seeks = DEFAULT_SEEKS,
};
/*
 * shrink_icache_memory - shrinker callback for the inode cache.
 * @nr:       number of inodes to scan; 0 means "only report the size".
 * @gfp_mask: allocation context of the caller.
 *
 * When a scan is requested and the caller allows filesystem re-entry
 * (__GFP_FS set), prune up to @nr inodes via prune_icache(). If __GFP_FS
 * is not set, nothing is scanned and -1 is returned. Otherwise the return
 * value is the number of unused inodes scaled by
 * sysctl_vfs_cache_pressure / 100.
 */
static int shrink_icache_memory(int nr, gfp_t gfp_mask)
{
	const int scan_requested = (nr != 0);

	/*
	 * Nasty deadlock avoidance. We may hold various FS locks,
	 * and we don't want to recurse into the FS that called us
	 * in clear_inode() and friends..
	 */
	if (scan_requested && (gfp_mask & __GFP_FS) == 0)
		return -1;

	if (scan_requested)
		prune_icache(nr);

	return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
}
/*
 * Scan `goal' inodes on the unused list for freeable ones. They are moved to
 * a temporary list and then are freed outside inode_lock by dispose_list().
 *
 * Any inodes which are pinned purely because of attached pagecache have their
 * pagecache removed. We expect the final iput() on that inode to add it to
 * the front of the inode_unused list. So look for it there and if the
 * inode is still freeable, proceed. The right inode is found 99.9% of the
 * time in testing on a 4-way.
 *
 * If the inode has metadata buffers attached to mapping->private_list then
 * try to remove them.
 *
 * @nr_to_scan: upper bound on the number of loop iterations (the `goal'
 *              mentioned above).
 */
static void prune_icache(int nr_to_scan)
{
/* temporary list collecting the inodes that can be freed */
LIST_HEAD(freeable);
int nr_pruned = 0;
int nr_scanned;
/* accumulated return values of invalidate_mapping_pages() */
unsigned long reap = 0;
down_read(&iprune_sem);
spin_lock(&inode_lock);
for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
struct inode *inode;
if (list_empty(&inode_unused))
break;
/* examine the entry at the .prev end of inode_unused */
inode = list_entry(inode_unused.prev, struct inode, i_list);
if (inode->i_state || atomic_read(&inode->i_count)) {
/* has state bits set or is referenced: move it to the head of inode_unused and skip it */
list_move(&inode->i_list, &inode_unused);
continue;
}
if (inode_has_buffers(inode) || inode->i_data.nrpages) {
/* presumably pins the inode while inode_lock is dropped; released by iput() below */
__iget(inode);
spin_unlock(&inode_lock);
/* if the buffers could be removed, also invalidate the inode's cached pages */
if (remove_inode_buffers(inode))
reap += invalidate_mapping_pages(&inode->i_data,
0, -1);
iput(inode);
spin_lock(&inode_lock);
/* the lock was dropped: only continue if this inode is now at the front of inode_unused */
if (inode != list_entry(inode_unused.next,
struct inode, i_list))
continue; /* wrong inode or list_empty */
if (!can_unuse(inode))
continue;
}
/* move the inode onto the freeable list and mark it as being freed */
list_move(&inode->i_list, &freeable);
WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_FREEING;
/* count the inodes moved onto freeable */
nr_pruned++;
}
/* update the global statistics */
inodes_stat.nr_unused -= nr_pruned;
if (current_is_kswapd())
__count_vm_events(KSWAPD_INODESTEAL, reap);
else
__count_vm_events(PGINODESTEAL, reap);
spin_unlock(&inode_lock);
/* dispose of everything collected on freeable, outside inode_lock */
dispose_list(&freeable);
up_read(&iprune_sem);
}
3. 注册文件描述符表释放函数
/* File descriptor table. */
struct fdtable {
/* maximum number of file structures the process can handle */
unsigned int max_fds;
/* current fd array: information about all open files */
struct file ** fd;
/* set of files to be closed by the exec system call */
fd_set *close_on_exec;
/* set of all currently open files */
fd_set *open_fds;
/* NOTE(review): presumably used for RCU-deferred freeing - confirm */
struct rcu_head rcu;
/* next table in a chain of tables; free_fdtable_work() follows this link */
struct fdtable *next;
};
start_kernel()->vfs_caches_init()->files_init()->files_defer_init()->fdtable_defer_list_init()->INIT_WORK(&fddef->wq, free_fdtable_work);
/*
 * free_fdtable_work - work handler that frees a chain of fdtables.
 * @work: the work item embedded (as 'wq') in a struct fdtable_defer.
 *
 * Detaches the whole chain hanging off the fdtable_defer under its lock,
 * then, without the lock, frees each table in turn: the fd array with
 * vfree(), the fd sets with free_fdset(), and the table itself with kfree().
 */
static void free_fdtable_work(struct work_struct *work)
{
	struct fdtable_defer *defer = container_of(work, struct fdtable_defer, wq);
	struct fdtable *pending;
	struct fdtable *following;

	/* Take ownership of the pending chain and leave the queue empty. */
	spin_lock_bh(&defer->lock);
	pending = defer->next;
	defer->next = NULL;
	spin_unlock_bh(&defer->lock);

	for (; pending; pending = following) {
		/* Save the link before the table is freed. */
		following = pending->next;
		vfree(pending->fd);
		free_fdset(pending);
		kfree(pending);
	}
}
4.sysfs文件系统初始化
start_kernel()->vfs_caches_init()->mnt_init()->sysfs_init()
/*
 * sysfs_init - set up sysfs.
 *
 * Creates the "sysfs_dir_cache" slab cache for struct sysfs_dirent, runs
 * sysfs_inode_init(), registers the sysfs filesystem type and creates the
 * kernel-internal sysfs mount.
 *
 * Returns 0 on success. Returns -ENOMEM if the cache cannot be created;
 * otherwise the error of the failing step is returned after the cache has
 * been destroyed (and, if the mount failed, the filesystem type has been
 * unregistered again).
 */
int __init sysfs_init(void)
{
	int ret;

	sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache",
					     sizeof(struct sysfs_dirent),
					     0, 0, NULL);
	if (!sysfs_dir_cachep)
		return -ENOMEM;

	ret = sysfs_inode_init();
	if (ret)
		goto destroy_cache;

	/* Register the filesystem type. */
	ret = register_filesystem(&sysfs_fs_type);
	if (ret)
		goto destroy_cache;

	/* Create the sysfs mount. */
	sysfs_mount = kern_mount(&sysfs_fs_type);
	if (IS_ERR(sysfs_mount)) {
		printk(KERN_ERR "sysfs: could not mount!\n");
		ret = PTR_ERR(sysfs_mount);
		sysfs_mount = NULL;
		unregister_filesystem(&sysfs_fs_type);
		goto destroy_cache;
	}

	/* ret is 0 here: register_filesystem() succeeded. */
	return ret;

destroy_cache:
	kmem_cache_destroy(sysfs_dir_cachep);
	sysfs_dir_cachep = NULL;
	return ret;
}