open系统调用

相关阅读:《open()在Linux内核的实现(2)-路径查找》;《详解应用层open函数如何调用到底层驱动中xxx_open函数》;《打开一个文件操作系统做了什么?》

一、总述 #

二、代码流程 #

1. SYSCALL_DEFINE3 从系统调用定义开始 #

 1// fs/open.c -- open() syscall entry: optionally ORs in O_LARGEFILE, then delegates to do_sys_open()
 2SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
 3{
 4	if (force_o_largefile())
 5		flags |= O_LARGEFILE;
 6	return do_sys_open(AT_FDCWD, filename, flags, mode);	/* AT_FDCWD is passed as the dfd argument */
 7}
 8
 9// fs/open.c -- do_sys_open(): packs flags/mode into a struct open_how and forwards to do_sys_openat2()
10long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
11{
12	struct open_how how = build_open_how(flags, mode);
13	return do_sys_openat2(dfd, filename, &how);	/* result is returned to the caller unchanged */
14}

1.1. 先构造描述打开方式的 open_how 结构体,主要是处理 flags 和 mode #

 1// fs/open.c -- flag helper macros used by build_open_how() and build_open_flags()
 2#define WILL_CREATE(flags)	(flags & (O_CREAT | __O_TMPFILE))	/* non-zero when O_CREAT or __O_TMPFILE is set; NOTE: argument is not parenthesized */
 3#define O_PATH_FLAGS		(O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)	/* the only flags kept/allowed together with O_PATH */
 4/* build_open_how(): build a struct open_how from flags and mode, masking both; returns it by value. */
 5inline struct open_how build_open_how(int flags, umode_t mode)
 6{
 7	struct open_how how = {
 8		.flags = flags & VALID_OPEN_FLAGS,	/* bits outside VALID_OPEN_FLAGS are dropped */
 9		.mode = mode & S_IALLUGO,	/* bits outside S_IALLUGO are dropped */
10	};
11
12	/* O_PATH beats everything else: only the O_PATH_FLAGS bits survive. */
13	if (how.flags & O_PATH)
14		how.flags &= O_PATH_FLAGS;
15	/* Modes should only be set for create-like flags; otherwise mode is zeroed. */
16	if (!WILL_CREATE(how.flags))
17		how.mode = 0;
18	return how;
19}

2. do_sys_openat2 继续调用打开文件 #

 1// fs/open.c -- do_sys_openat2(): validate how, reserve an fd, open the file, and bind the fd to the file
 2static long do_sys_openat2(int dfd, const char __user *filename,
 3			   struct open_how *how)
 4{
 5	struct open_flags op;
 6    // Build struct open_flags from open_how; fd is reused here only to hold the return code
 7	int fd = build_open_flags(how, &op);
 8	struct filename *tmp;
 9
10	if (fd)
11		return fd;	/* non-zero result from build_open_flags() is returned as-is */
12
13	tmp = getname(filename);
14	if (IS_ERR(tmp))
15		return PTR_ERR(tmp);
16
17    // Get an unused fd
18	fd = get_unused_fd_flags(how->flags);
19	if (fd >= 0) {
20        // The function that actually opens the file
21		struct file *f = do_filp_open(dfd, tmp, &op);
22		if (IS_ERR(f)) {
23			put_unused_fd(fd);	/* give the reserved fd back */
24			fd = PTR_ERR(f);	/* the error code becomes the return value */
25		} else {
26			fsnotify_open(f);
27            // Bind the fd to the file structure
28			fd_install(fd, f);
29		}
30	}
31	putname(tmp);	/* pairs with getname() above; reached on success and on failure after getname() */
32	return fd;	/* fd on success; otherwise the negative value stored above */
33}

2.1. build_open_flags 构建open_flags #

  1// fs/open.c -- build_open_flags(): validate how and fill op (mode, open_flag, acc_mode, intent, lookup_flags); returns 0, -EINVAL or -EAGAIN
  2inline int build_open_flags(const struct open_how *how, struct open_flags *op)
  3{
  4	u64 flags = how->flags;
  5	u64 strip = FMODE_NONOTIFY | O_CLOEXEC;
  6	int lookup_flags = 0;
  7	int acc_mode = ACC_MODE(flags);
  8
  9	BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
 10			 "struct open_flags doesn't yet handle flags > 32 bits");
 11
 12	/*
 13	 * Strip flags that either shouldn't be set by userspace like
 14	 * FMODE_NONOTIFY or that aren't relevant in determining struct
 15	 * open_flags like O_CLOEXEC.
 16	 */
 17	flags &= ~strip;
 18
 19	/*
 20	 * Older syscalls implicitly clear all of the invalid flags or argument
 21	 * values before calling build_open_flags(), but openat2(2) checks all
 22	 * of its arguments.
 23	 */
 24	if (flags & ~VALID_OPEN_FLAGS)
 25		return -EINVAL;
 26	if (how->resolve & ~VALID_RESOLVE_FLAGS)
 27		return -EINVAL;
 28
 29	/* Scoping flags are mutually exclusive. */
 30	if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT))
 31		return -EINVAL;
 32
 33	/* Deal with the mode: only create-like opens may carry one. */
 34	if (WILL_CREATE(flags)) {
 35		if (how->mode & ~S_IALLUGO)
 36			return -EINVAL;
 37		op->mode = how->mode | S_IFREG;
 38	} else {
 39		if (how->mode != 0)
 40			return -EINVAL;
 41		op->mode = 0;
 42	}
 43
 44	/*
 45	 * In order to ensure programs get explicit errors when trying to use
 46	 * O_TMPFILE on old kernels, O_TMPFILE is implemented such that it
 47	 * looks like (O_DIRECTORY|O_RDWR & ~O_CREAT) to old kernels. But we
 48	 * have to require userspace to explicitly set it.
 49	 */
 50	if (flags & __O_TMPFILE) {
 51		if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
 52			return -EINVAL;
 53		if (!(acc_mode & MAY_WRITE))
 54			return -EINVAL;
 55	}
 56	if (flags & O_PATH) {
 57		/* O_PATH only permits certain other flags to be set. */
 58		if (flags & ~O_PATH_FLAGS)
 59			return -EINVAL;
 60		acc_mode = 0;
 61	}
 62
 63	/*
 64	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
 65	 * check for O_DSYNC if they need any syncing at all we enforce it's
 66	 * always set instead of having to deal with possibly weird behaviour
 67	 * for malicious applications setting only __O_SYNC.
 68	 */
 69	if (flags & __O_SYNC)
 70		flags |= O_DSYNC;
 71
 72	op->open_flag = flags;	/* stored before O_NOFOLLOW may be added to the local flags below */
 73
 74	/* O_TRUNC implies we need access checks for write permissions */
 75	if (flags & O_TRUNC)
 76		acc_mode |= MAY_WRITE;
 77
 78	/* Allow the LSM permission hook to distinguish append
 79	   access from general write access. */
 80	if (flags & O_APPEND)
 81		acc_mode |= MAY_APPEND;
 82
 83	op->acc_mode = acc_mode;
 84
 85	op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
 86
 87	if (flags & O_CREAT) {
 88		op->intent |= LOOKUP_CREATE;
 89		if (flags & O_EXCL) {
 90			op->intent |= LOOKUP_EXCL;
 91			flags |= O_NOFOLLOW;	/* only affects the lookup_flags computed below */
 92		}
 93	}
 94
 95	if (flags & O_DIRECTORY)
 96		lookup_flags |= LOOKUP_DIRECTORY;
 97	if (!(flags & O_NOFOLLOW))
 98		lookup_flags |= LOOKUP_FOLLOW;
 99
100	if (how->resolve & RESOLVE_NO_XDEV)	/* each RESOLVE_* bit maps to one LOOKUP_* bit */
101		lookup_flags |= LOOKUP_NO_XDEV;
102	if (how->resolve & RESOLVE_NO_MAGICLINKS)
103		lookup_flags |= LOOKUP_NO_MAGICLINKS;
104	if (how->resolve & RESOLVE_NO_SYMLINKS)
105		lookup_flags |= LOOKUP_NO_SYMLINKS;
106	if (how->resolve & RESOLVE_BENEATH)
107		lookup_flags |= LOOKUP_BENEATH;
108	if (how->resolve & RESOLVE_IN_ROOT)
109		lookup_flags |= LOOKUP_IN_ROOT;
110	if (how->resolve & RESOLVE_CACHED) {
111		/* Don't bother even trying for create/truncate/tmpfile open */
112		if (flags & (O_TRUNC | O_CREAT | O_TMPFILE))
113			return -EAGAIN;
114		lookup_flags |= LOOKUP_CACHED;
115	}
116
117	op->lookup_flags = lookup_flags;
118	return 0;
119}

3. do_filp_open 打开文件的调用 #

  • path_openat 先通过 alloc_empty_file 申请 file 结构体,打开成功后将其作为返回值
  • link_path_walk 负责找到路径的最后一个分量
  • open_last_lookups 对最后一个分量进行处理:查找文件是否存在,不存在时根据打开标志决定是否创建
 1// fs/namei.c -- do_filp_open(): run path_openat() up to three times with different lookup flags and return the file or an ERR_PTR
 2struct file *do_filp_open(int dfd, struct filename *pathname,
 3		const struct open_flags *op)
 4{
 5	struct nameidata nd;
 6	int flags = op->lookup_flags;
 7	struct file *filp;
 8
 9	set_nameidata(&nd, dfd, pathname, NULL);
10	filp = path_openat(&nd, op, flags | LOOKUP_RCU);	/* first attempt: with LOOKUP_RCU set */
11	if (unlikely(filp == ERR_PTR(-ECHILD)))
12		filp = path_openat(&nd, op, flags);	/* -ECHILD: retry without LOOKUP_RCU */
13	if (unlikely(filp == ERR_PTR(-ESTALE)))
14		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);	/* -ESTALE: retry with LOOKUP_REVAL */
15	restore_nameidata();	/* pairs with set_nameidata() above */
16	return filp;
17}
18
19// fs/namei.c -- path_openat(): allocate a struct file, resolve the path and open it; returns the file or ERR_PTR(error)
20static struct file *path_openat(struct nameidata *nd,
21			const struct open_flags *op, unsigned flags)
22{
23	struct file *file;
24	int error;
25
26    // Allocate the file structure
27	file = alloc_empty_file(op->open_flag, current_cred());
28	if (IS_ERR(file))
29		return file;
30
31	if (unlikely(file->f_flags & __O_TMPFILE)) {
32		error = do_tmpfile(nd, flags, op, file);
33	} else if (unlikely(file->f_flags & O_PATH)) {
34		error = do_o_path(nd, flags, file);
35	} else {
36		const char *s = path_init(nd, flags);
37		// link_path_walk walks to the last component of the path
38		// open_last_lookups handles the last component: it looks up whether the file exists and, if not, creates it depending on the conditions
39		while (!(error = link_path_walk(s, nd)) &&
40		       (s = open_last_lookups(nd, file, op)) != NULL)
41			;	/* empty body: loop again while open_last_lookups() returns a non-NULL string */
42		if (!error)
43			// Once the walk has finished, run the remaining open steps
44			error = do_open(nd, file, op);
45		terminate_walk(nd);
46	}
47	if (likely(!error)) {
48		if (likely(file->f_mode & FMODE_OPENED))
49			return file;
50		WARN_ON(1);	/* no error reported but the file was never marked FMODE_OPENED */
51		error = -EINVAL;
52	}
53    // This call drops a reference on the file
54	fput(file);
55	if (error == -EOPENSTALE) {	/* translated into the codes do_filp_open() retries on */
56		if (flags & LOOKUP_RCU)
57			error = -ECHILD;
58		else
59			error = -ESTALE;
60	}
61	return ERR_PTR(error);
62}

4. do_open 打开文件的最后一步 #

 1// fs/namei.c
 2/*
 3 * Handle the last step of open(): final checks on nd->path, optional truncate, then vfs_open(). Returns 0 or a negative error.
 4 */
 5static int do_open(struct nameidata *nd,
 6		   struct file *file, const struct open_flags *op)
 7{
 8	struct user_namespace *mnt_userns;
 9	int open_flag = op->open_flag;
10	bool do_truncate;
11	int acc_mode;
12	int error;
13
14	if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
15		error = complete_walk(nd);
16		if (error)
17			return error;
18	}
19	if (!(file->f_mode & FMODE_CREATED))
20		audit_inode(nd->name, nd->path.dentry, 0);
21	mnt_userns = mnt_user_ns(nd->path.mnt);
22	if (open_flag & O_CREAT) {
23		if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
24			return -EEXIST;	/* O_CREAT|O_EXCL but this open did not create the file */
25		if (d_is_dir(nd->path.dentry))
26			return -EISDIR;
27		error = may_create_in_sticky(mnt_userns, nd,
28					     d_backing_inode(nd->path.dentry));
29		if (unlikely(error))
30			return error;
31	}
32	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
33		return -ENOTDIR;
34
35	do_truncate = false;
36	acc_mode = op->acc_mode;
37	if (file->f_mode & FMODE_CREATED) {
38		/* Don't check for write permission, don't truncate */
39		open_flag &= ~O_TRUNC;
40		acc_mode = 0;
41	} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
42		error = mnt_want_write(nd->path.mnt);
43		if (error)
44			return error;
45		do_truncate = true;	/* also records that mnt_drop_write() is owed at the end */
46	}
47	error = may_open(mnt_userns, &nd->path, acc_mode, open_flag);
48	if (!error && !(file->f_mode & FMODE_OPENED))
49		error = vfs_open(&nd->path, file);
50	if (!error)
51		error = ima_file_check(file, op->acc_mode);	/* NOTE: uses op->acc_mode, not the local acc_mode that may have been zeroed above */
52	if (!error && do_truncate)
53		error = handle_truncate(mnt_userns, file);
54	if (unlikely(error > 0)) {	/* positive values are not valid errors here; warn and convert */
55		WARN_ON(1);
56		error = -EINVAL;
57	}
58	if (do_truncate)
59		mnt_drop_write(nd->path.mnt);	/* pairs with mnt_want_write() above */
60	return error;
61}

5. vfs_open 虚拟文件系统打开文件 #

  • 设置文件操作集:f->f_op = fops_get(inode->i_fop)
  • 调用文件操作集中对应的 open 函数(f->f_op->open)
  1// fs/open.c
  2/**
  3 * vfs_open - open the file at the given path
  4 * @path: path to open; copied into file->f_path
  5 * @file: newly allocated file with f_flags initialized
  6 * Return: the result of do_dentry_open(); note this signature takes no cred argument
  7 */
  8int vfs_open(const struct path *path, struct file *file)
  9{
 10	file->f_path = *path;
 11	return do_dentry_open(file, d_backing_inode(path->dentry), NULL);	/* NULL: let do_dentry_open() pick f_op->open */
 12}
 13
 14// fs/open.c -- do_dentry_open(): initialise f for inode, install f_op, call the open method, set FMODE_* bits; returns 0 or a negative error
 15static int do_dentry_open(struct file *f,
 16			  struct inode *inode,
 17			  int (*open)(struct inode *, struct file *))
 18{
 19	static const struct file_operations empty_fops = {};
 20	int error;
 21
 22	path_get(&f->f_path);	/* undone by path_put() at cleanup_file on failure */
 23	f->f_inode = inode;
 24	f->f_mapping = inode->i_mapping;
 25	f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
 26	f->f_sb_err = file_sample_sb_err(f);
 27
 28	if (unlikely(f->f_flags & O_PATH)) {	/* O_PATH: no open method is called, empty f_op is installed */
 29		f->f_mode = FMODE_PATH | FMODE_OPENED;
 30		f->f_op = &empty_fops;
 31		return 0;
 32	}
 33
 34	if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
 35		error = get_write_access(inode);
 36		if (unlikely(error))
 37			goto cleanup_file;
 38		error = __mnt_want_write(f->f_path.mnt);
 39		if (unlikely(error)) {
 40			put_write_access(inode);
 41			goto cleanup_file;
 42		}
 43		f->f_mode |= FMODE_WRITER;	/* tells cleanup_all that both write references must be dropped */
 44	}
 45
 46	/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
 47	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
 48		f->f_mode |= FMODE_ATOMIC_POS;
 49
 50	// Set the file's operations (methods)
 51	f->f_op = fops_get(inode->i_fop);
 52	if (WARN_ON(!f->f_op)) {
 53		error = -ENODEV;
 54		goto cleanup_all;
 55	}
 56
 57	error = security_file_open(f);
 58	if (error)
 59		goto cleanup_all;
 60
 61	error = break_lease(locks_inode(f), f->f_flags);
 62	if (error)
 63		goto cleanup_all;
 64
 65	/* normally all 3 are set; ->open() can clear them if needed */
 66	f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
 67	// Look up the open method in f_op here and call it (unless the caller supplied one)
 68	if (!open)
 69		open = f->f_op->open;
 70	if (open) {
 71		error = open(inode, f);
 72		if (error)
 73			goto cleanup_all;
 74	}
 75	// Mark the file as opened
 76	f->f_mode |= FMODE_OPENED;
 77	if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
 78		i_readcount_inc(inode);	/* only for files opened read-only */
 79	if ((f->f_mode & FMODE_READ) &&
 80	     likely(f->f_op->read || f->f_op->read_iter))
 81		f->f_mode |= FMODE_CAN_READ;
 82	if ((f->f_mode & FMODE_WRITE) &&
 83	     likely(f->f_op->write || f->f_op->write_iter))
 84		f->f_mode |= FMODE_CAN_WRITE;
 85	if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
 86		f->f_mode |= FMODE_CAN_ODIRECT;
 87
 88	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);	/* these flags are not kept in f_flags after open */
 89
 90	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
 91
 92	if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
 93		return -EINVAL;	/* NOTE(review): bypasses cleanup_all; presumably the caller's fput() releases the file since FMODE_OPENED is set -- confirm */
 94
 95	/*
 96	 * XXX: Huge page cache doesn't support writing yet. Drop all page
 97	 * cache for this file before processing writes.
 98	 */
 99	if (f->f_mode & FMODE_WRITE) {
100		/*
101		 * Paired with smp_mb() in collapse_file() to ensure nr_thps
102		 * is up to date and the update to i_writecount by
103		 * get_write_access() is visible. Ensures subsequent insertion
104		 * of THPs into the page cache will fail.
105		 */
106		smp_mb();
107		if (filemap_nr_thps(inode->i_mapping)) {
108			struct address_space *mapping = inode->i_mapping;
109
110			filemap_invalidate_lock(inode->i_mapping);
111			/*
112			 * unmap_mapping_range just need to be called once
113			 * here, because the private pages is not need to be
114			 * unmapped mapping (e.g. data segment of dynamic
115			 * shared libraries here).
116			 */
117			unmap_mapping_range(mapping, 0, 0, 0);
118			truncate_inode_pages(mapping, 0);
119			filemap_invalidate_unlock(inode->i_mapping);
120		}
121	}
122
123	return 0;
124
125cleanup_all:	/* failure after f_op was assigned: drop f_op and any write references, then fall through */
126	if (WARN_ON_ONCE(error > 0))
127		error = -EINVAL;
128	fops_put(f->f_op);
129	if (f->f_mode & FMODE_WRITER) {
130		put_write_access(inode);
131		__mnt_drop_write(f->f_path.mnt);
132	}
133cleanup_file:	/* drop the path reference taken at entry and clear the fields set from it */
134	path_put(&f->f_path);
135	f->f_path.mnt = NULL;
136	f->f_path.dentry = NULL;
137	f->f_inode = NULL;
138	return error;
139}
139}