您现在的位置是：首页 > 其他

当前栏目

Linux|IO|File IO源码剖析

文件源码

2023-03-15 22:02:22 时间

文件的open、close、read、write是最基本的文件抽象，描述了对于设备的操作。本文将结合用户态的接口以及内核态的实现剖析文件IO。

Reference: The Linux Programming Interface: Chapter 4/14/15, Kernel/fs

通用接口

通用IO包含open/read/write/close，大部分文件系统和设备驱动都支持对应接口(或者iter版本)

Open

接口

open用于创建或打开VFS路径下的文件并且获得fd

pathname为vfs文件路径
flags为标志
mode则具体描述了指定O_CREAT时新建文件的权限，未指定O_CREAT时可省略。

flags分为三个部分:

文件访问模式标志 - 互斥，不可位或。通过fcntl(F_GETFL)可读。
文件创建标志 - 创建以及后续IO的选项，不可读写。
文件状态标志 - IO的方式，可读可写

实现

在fs/open.c和fs/namei.c中可见相关部分代码，省略了次要代码，保留关键路径

syscall - 64位syscall默认能打开大文件

/*
 * openat syscall entry: forwards its arguments to do_sys_open(), with
 * O_LARGEFILE OR-ed into the flags whenever force_o_largefile() is true.
 */
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
		umode_t, mode)
{
	const int open_flags = force_o_largefile() ? (flags | O_LARGEFILE)
						   : flags;

	return do_sys_open(dfd, filename, open_flags, mode);
}

do_sys_open - 通过flags和mode构建open_how

/*
 * do_sys_open - pack @flags and @mode into a struct open_how via
 * build_open_how() and pass it on to do_sys_openat2().
 *
 * Return: whatever do_sys_openat2() returns.
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
	struct open_how how;

	how = build_open_how(flags, mode);

	return do_sys_openat2(dfd, filename, &how);
}

do_sys_openat2 - 通过open_how获取open_flags

/*
 * do_sys_openat2 - turn an open_how into an installed file descriptor.
 * @dfd:      directory fd forwarded to do_filp_open()
 * @filename: user-space path pointer, copied in with getname()
 * @how:      open_how built by the caller
 *
 * Translates @how into open_flags, reserves a descriptor, opens the file
 * and installs it into the reserved slot.
 *
 * Return: the new descriptor on success, a negative errno otherwise.
 */
static long do_sys_openat2(int dfd, const char __user *filename,
			   struct open_how *how)
{
	struct open_flags op;
	struct filename *tmp;
	int fd = build_open_flags(how, &op);

	/* build_open_flags() yields a status (0 or -errno), not a descriptor. */
	if (fd)
		return fd;

	tmp = getname(filename);
	if (IS_ERR(tmp))
		return PTR_ERR(tmp);

	fd = get_unused_fd_flags(how->flags);
	if (fd >= 0) {
		struct file *f = do_filp_open(dfd, tmp, &op);

		if (IS_ERR(f)) {
			/* The open failed: hand the reserved slot back. */
			put_unused_fd(fd);
			fd = PTR_ERR(f);
		} else {
			fsnotify_open(f);
			/* Bind the opened file to the reserved descriptor. */
			fd_install(fd, f);
		}
	}
	putname(tmp);
	return fd;
}

do_filp_open - 设置查找上下文

/*
 * do_filp_open - run the path lookup/open for (@dfd, @pathname).
 *
 * Calls set_nameidata() on a local nameidata, invokes path_openat() with
 * LOOKUP_RCU added to op->lookup_flags, calls restore_nameidata(), and
 * returns whatever path_openat() produced. Only this single path_openat()
 * attempt appears in the listing.
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
		const struct open_flags *op)
{
	struct nameidata nd;
	struct file *result;
	const int lookup = op->lookup_flags | LOOKUP_RCU;

	set_nameidata(&nd, dfd, pathname);
	result = path_openat(&nd, op, lookup);
	restore_nameidata();

	return result;
}

path_openat - 进行名称查找

static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
{
	while (!(error = link_path_walk(s, nd)) &&
		     (s = open_last_lookups(nd, file, op)) != NULL);
	if (!error)
		error = do_open(nd, file, op);
		terminate_walk(nd);
	}
}

do_open - 如果当前没有打开文件，则进行打开，需要处理truncate的情况

static int do_open(struct nameidata *nd,
		   struct file *file, const struct open_flags *op)
{
	error = may_open(&nd->path, acc_mode, open_flag);
	if (!error && !(file->f_mode & FMODE_OPENED))
		error = vfs_open(&nd->path, file);
	if (!error)
		error = ima_file_check(file, op->acc_mode);
	if (!error && do_truncate)
		error = handle_truncate(file);
	if (do_truncate)
		mnt_drop_write(nd->path.mnt);
	return error;
}

vfs_open - VFS打开文件

/*
 * vfs_open - record @path in @file and open it through do_dentry_open().
 *
 * The inode passed on is d_backing_inode(path->dentry); no explicit open
 * callback is supplied (NULL).
 *
 * Return: the result of do_dentry_open().
 */
int vfs_open(const struct path *path, struct file *file)
{
	struct inode *inode = d_backing_inode(path->dentry);

	file->f_path = *path;

	return do_dentry_open(file, inode, NULL);
}

do_dentry_open - 利用实际文件系统或者驱动的open函数打开，增加引用计数，结合文件本身设定的权限f_mode以及文件系统提供的权限f_op获得实际权限。

/*
 * do_dentry_open - invoke the ->open() callback for @f and derive its
 * effective mode bits (abridged listing).
 * @f:     file being opened
 * @inode: inode handed to the ->open() callback
 * @open:  optional open callback; when NULL, f->f_op->open is used instead
 *
 * NOTE(review): the declaration of `error`, the `cleanup_all` label and the
 * function's return statements are not part of this excerpt.
 */
static int do_dentry_open(struct file *f,
			  struct inode *inode,
			  int (*open)(struct inode *, struct file *))
{
	/* normally all 3 are set; ->open() can clear them if needed */
	f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
	/* Fall back to the callback in f->f_op when the caller supplied none. */
	if (!open)
		open = f->f_op->open;
	/* The callback is optional; a non-zero result aborts the open. */
	if (open) {
		error = open(inode, f);
		if (error)
			goto cleanup_all;
	}
	/* Reached only when there was no callback or it returned 0. */
	f->f_mode |= FMODE_OPENED;
	/* FMODE_READ set and FMODE_WRITE clear: bump the inode's read count. */
	if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
		i_readcount_inc(inode);
	/* CAN_READ needs both the read mode bit and a ->read/->read_iter method. */
	if ((f->f_mode & FMODE_READ) &&
	     likely(f->f_op->read || f->f_op->read_iter))
		f->f_mode |= FMODE_CAN_READ;
	/* CAN_WRITE needs both the write mode bit and a ->write/->write_iter method. */
	if ((f->f_mode & FMODE_WRITE) &&
	     likely(f->f_op->write || f->f_op->write_iter))
		f->f_mode |= FMODE_CAN_WRITE;
}

Read

接口

read用于从当前的文件偏移量处读取一定数目的字节

fd为文件描述符
count为最大读取字节数，最大为MAX_RW_COUNT
buffer为用户态缓冲区

需要注意的是，read并不遵循C语言字符串以'\0'结尾的约定，因此若要把读到的数据当作字符串使用，应该显式在buffer末尾增加'\0'，并保证buffer size >= count + 1

实现

在fs/read_write.c中可见相关部分代码，省略了次要代码，保留关键路径

syscall - 不说了

/* read syscall entry: a direct pass-through to ksys_read(). */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	const ssize_t nread = ksys_read(fd, buf, count);

	return nread;
}

ksys_read - 根据文件描述符获取文件的偏移量

/*
 * ksys_read - read from descriptor @fd into the user buffer @buf.
 *
 * Looks the descriptor up with fdget_pos(), reads through vfs_read() using
 * a local copy of the file position, and stores that copy back into
 * f_pos when the read did not fail.
 *
 * Return: the result of vfs_read(), or -EBADF when @fd has no file.
 */
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
	struct fd f = fdget_pos(fd);
	loff_t offset;
	loff_t *ppos;
	ssize_t nread;

	if (!f.file)
		return -EBADF;

	ppos = file_ppos(f.file);
	if (ppos) {
		/* Let vfs_read() advance a private copy of the position. */
		offset = *ppos;
		ppos = &offset;
	}

	nread = vfs_read(f.file, buf, count, ppos);
	if (ppos && nread >= 0)
		f.file->f_pos = offset;

	fdput_pos(f);
	return nread;
}

vfs_read - 先校验本次读取是否合法，然后看文件系统或者驱动有没有提供read接口，有则直接调用；否则通过read_iter(struct kiocb *iocb, struct iov_iter *to)读取。

/*
 * vfs_read - validate a read request and dispatch it to the file's methods.
 * @file:  file to read from
 * @buf:   user-space destination buffer
 * @count: number of bytes requested; clamped to MAX_RW_COUNT
 * @pos:   position pointer handed to the read method
 *
 * Return: the value produced by ->read or new_sync_read() on the normal
 * path; -EBADF if the file is not open for reading; -EINVAL if it cannot
 * be read (no FMODE_CAN_READ, or neither read method exists); -EFAULT if
 * @buf fails access_ok(); or the non-zero result of rw_verify_area().
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);
	if (ret)
		return ret;
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;

	/* Prefer ->read; otherwise go through ->read_iter. */
	if (file->f_op->read)
		ret = file->f_op->read(file, buf, count, pos);
	else if (file->f_op->read_iter)
		ret = new_sync_read(file, buf, count, pos);
	else
		ret = -EINVAL;

	/* Read-side counterpart of the notification/accounting in vfs_write(). */
	if (ret > 0) {
		fsnotify_access(file);
		add_rchar(current, ret);
	}
	inc_syscr(current);
	return ret;
}

Write

接口

write用于从当前的文件偏移量处写入一定数目的字节

fd为文件描述符
count为最大写入字节数，最大为MAX_RW_COUNT
buffer为用户态缓冲区

需要注意的是，我们在使用C++时往往会使用flush，用于刷新缓冲区。问题在于，这个操作仅仅是刷新用户态的缓冲区！内核依然会对write进行缓存,需要手动进行刷新，通过vfs调用磁盘驱动提供的flush原语。

fsync(fd)强制其刷新到磁盘上
fdatasync(fd)只保证数据以及后续读取数据所必需的元数据(如文件大小)落盘，不强制刷新时间戳这类元数据
sync()刷新所有的缓冲区(Linux要求等待所有操作完成才能返回)。

因为这个原因，写操作并不能实时地进行持久化，需要文件系统使用journal(日志)机制来保证崩溃一致性，然而journal机制本身又需要进行flush。以JM表示日志中的元数据块(journal metadata)，JC表示日志提交块(journal commit)：我们必须保证JC写入前，Data和JM都已经被写入磁盘；同时保证Metadata写回其最终位置前，JC已被写入磁盘。

SOSP13有人提出了乐观的崩溃一致性，减少了Flush的开销。

实现

在fs/read_write.c中可见相关部分代码，省略了次要代码，保留关键路径

和read的逻辑基本一致，区别在于vfs_write用file_start_write/file_end_write把实际写入包裹起来(对普通文件而言是获取超级块的写访问，防止写入过程中文件系统被freeze)，并在写入成功后发出fsnotify修改事件

/*
 * Excerpt from the body of vfs_write(): the dispatch and accounting part.
 * The enclosing function and its earlier checks are not shown here.
 */
/* The write itself is bracketed by file_start_write()/file_end_write(). */
file_start_write(file);
	/* Prefer ->write; otherwise go through ->write_iter. */
	if (file->f_op->write)
		ret = file->f_op->write(file, buf, count, pos);
	else if (file->f_op->write_iter)
		ret = new_sync_write(file, buf, count, pos);
	else
		ret = -EINVAL;
	/* Only a write that transferred data (ret > 0) is notified and counted. */
	if (ret > 0) {
		fsnotify_modify(file);
		add_wchar(current, ret);
	}
	/* Called on every pass through this code, whatever ret is. */
	inc_syscw(current);
	file_end_write(file);

Close

接口

close用于释放文件描述符资源和关闭文件，进程结束时也会自动释放。

实现

在fs/open.c和fs/file.c中可见相关部分代码，省略了次要代码，保留关键路径

syscall - 做Retval的转换

/*
 * close syscall entry. The fd slot is released by __close_fd(), which
 * checks that the slot holds a file before freeing the descriptor, so one
 * clone task cannot release an fd that another clone is still opening.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
	int retval = __close_fd(current->files, fd);

	/*
	 * The file table entry is already cleared, so the syscall cannot be
	 * restarted: report every restart code as -EINTR instead.
	 */
	switch (retval) {
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
	case -ERESTART_RESTARTBLOCK:
		return -EINTR;
	default:
		return retval;
	}
}

__close_fd - 从传入的files_struct中取出fd表，在持有file_lock的情况下于fd索引处取出file，并通过rcu_assign_pointer将该槽位置为NULL，然后归还fd资源。释放锁之后已经离开临界区，该fd可以被重新分配使用了。最后通过filp_close关闭文件。

这里需要注意，先释放fd资源，再释放文件资源。

/*
 * __close_fd - detach descriptor @fd from @files and close its file.
 *
 * Under files->file_lock the slot is looked up, cleared with
 * rcu_assign_pointer() and the descriptor number is returned via
 * __put_unused_fd(). The file itself is closed with filp_close() after
 * the lock has been dropped.
 *
 * The same warnings as for __alloc_fd()/__fd_install() apply here...
 *
 * Return: the result of filp_close(), or -EBADF when @fd is out of range
 * or the slot is empty.
 */
int __close_fd(struct files_struct *files, unsigned fd)
{
	struct file *victim = NULL;
	struct fdtable *table;

	spin_lock(&files->file_lock);
	table = files_fdtable(files);
	if (fd < table->max_fds) {
		victim = table->fd[fd];
		if (victim) {
			rcu_assign_pointer(table->fd[fd], NULL);
			__put_unused_fd(files, fd);
		}
	}
	spin_unlock(&files->file_lock);

	if (!victim)
		return -EBADF;

	return filp_close(victim, files);
}

filp_close - 如果文件提供了flush方法则调用它刷新文件缓冲，然后通过fput释放当前文件(引用计数--)

/*
 * filp_close - flush and release one reference to @filp.
 * @filp: file to close
 * @id:   owner handed to ->flush(), dnotify_flush() and
 *        locks_remove_posix(). "id" is the POSIX thread ID; the files
 *        pointer is used for this.
 *
 * Return: the result of ->flush() when the file provides one, otherwise 0.
 * Also 0 (after logging an error) when the file count is already zero.
 */
int filp_close(struct file *filp, fl_owner_t id)
{
	int retval = 0;

	if (!file_count(filp)) {
		printk(KERN_ERR "VFS: Close: file count is 0\n");
		return 0;
	}

	if (filp->f_op->flush)
		retval = filp->f_op->flush(filp, id);

	/* Files with FMODE_PATH set skip the dnotify and POSIX lock cleanup. */
	if (likely(!(filp->f_mode & FMODE_PATH))) {
		dnotify_flush(filp, id);
		locks_remove_posix(filp, id);
	}
	fput(filp);
	return retval;
}

Lseek

文件的读写共用相同的pos，在读写时自动从内核的文件状态中获取

whence表示参考基

SEEK_SET 以文件头部为基点
SEEK_CUR 以当前偏移量为基点
SEEK_END 以文件尾部为基点

显然，对于管道、FIFO、socket、终端这类没有固定起点的流式文件，lseek不可行(会返回ESPIPE)。lseek适用于普通文件和块设备等支持随机访问的对象。

通过间接层处理空洞 - 当我们用lseek把偏移量移到文件尾(SEEK_END)之后再写入时，原文件尾到新写入位置之间会存在空洞。Linux并不会为空洞分配block存储，只需把inode中对应的块指针标记为0，表明其并未指向实际磁盘块即可。这个思想和多级页表是相同的。

通过压缩处理空洞 - 类似的，在bitmap中也有可能存在空洞，谷歌的EWAH Compressed Bitmap就采取了压缩的方式将连续的1/0压缩成length + 1/0。

Ioctl

非通用的IO操作，通过指定的request值表示操作，后续传递参数的类型通过request的值进行解释。

总结

系统调用都通过VFS层来进行文件操作，而实际的操作通过背后的文件系统或设备驱动完成。

fd的本质是进程fdt->fd的索引，元素为内核中的file结构体，存储打开文件的状态。

由内核在file中隐式维护偏移量，并在读写时自动更新。

read/write操作不一定直接调用read/write，可能是iter；write操作也无法保证实时更新到磁盘上。

close时，文件描述符的释放先进行，然后才进行实际文件的释放。

猜你喜欢

Python中的函数与方法以及Bound Method和Unbound Method
从本体论开始说起——运营商关系图谱的构建及应用
一篇运维老司机的大数据平台监控宝典（2）-联通大数据集群平台监控体系详解
一篇运维老司机的大数据平台监控宝典（1）-联通大数据集群平台监控体系进程详解
Flask中的请求上下文和应用上下文
深入探讨Java中的异常与错误处理
研究学习Kotlin的一些方法
如何成为一名数据科学家？
金融服务领域的大数据：即时分析
影响大数据、机器学习和人工智能未来发展的8个因素
从未见过的堂兄杀了人，你的DNA是关键证据
一文贯通python文件读取
数据显示Java热度持续下落，日子屈指可数？
从0开始构建一个属于你自己的PHP框架
如何将Hadoop集成到工作流程中？这6个优秀实践必看
2017年5月编程语言排行榜：Java与C语言优势正开始缩小
SEO公司使用大数据优化其模型的5种方法
Java多线程之内置锁与显示锁
关于Web Workers你需要了解的七件事
20个安全可靠的免费数据源，各领域数据任你挑

zl程序教程