open()在Linux内核的实现(2)-路径查找 详解应用层open函数如何调用到底层驱动中xxx_open函数 打开一个文件操作系统做了什么?
一、总述 #
二、代码流程 #
1. SYSCALL_DEFINE3 从系统调用定义开始 #
1// fs/open.c -- open(2) entry point: ORs in O_LARGEFILE when force_o_largefile() is true, then calls do_sys_open() with AT_FDCWD as the directory fd
2SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
3{
4 if (force_o_largefile())
5 flags |= O_LARGEFILE;
6 return do_sys_open(AT_FDCWD, filename, flags, mode);
7}
8
9// fs/open.c -- converts (flags, mode) into a struct open_how with build_open_how() and returns the result of do_sys_openat2()
10long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
11{
12 struct open_how how = build_open_how(flags, mode);
13 return do_sys_openat2(dfd, filename, &how);
14}
1.1. 先构造如何打开的结构体,主要是处理flag #
1// fs/open.c -- flag masks used by build_open_how() and build_open_flags()
2#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) /* non-zero when O_CREAT or __O_TMPFILE is set in flags */
3#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) /* the only flag bits kept/accepted together with O_PATH */
4
5inline struct open_how build_open_how(int flags, umode_t mode) /* Builds a struct open_how from legacy flags/mode, dropping bits outside VALID_OPEN_FLAGS and S_IALLUGO. */
6{
7 struct open_how how = {
8 .flags = flags & VALID_OPEN_FLAGS,
9 .mode = mode & S_IALLUGO,
10 };
11
12 /* O_PATH beats everything else: only the bits in O_PATH_FLAGS survive. */
13 if (how.flags & O_PATH)
14 how.flags &= O_PATH_FLAGS;
15 /* Modes should only be set for create-like flags (O_CREAT or __O_TMPFILE, see WILL_CREATE). */
16 if (!WILL_CREATE(how.flags))
17 how.mode = 0;
18 return how;
19}
2. do_sys_openat2 继续调用打开文件 #
1// fs/open.c -- validates the request, obtains a struct filename for the user path, reserves an fd, opens the file and installs it; returns the fd or the error from the failing step
2static long do_sys_openat2(int dfd, const char __user *filename,
3 struct open_how *how)
4{
5 struct open_flags op;
6 // Build the open_flags struct from open_how; fd is only reused here to hold the return value for the check below
7 int fd = build_open_flags(how, &op);
8 struct filename *tmp;
9
10 if (fd)
11 return fd;
12
13 tmp = getname(filename);
14 if (IS_ERR(tmp))
15 return PTR_ERR(tmp);
16
17 // Get an unused fd
18 fd = get_unused_fd_flags(how->flags);
19 if (fd >= 0) {
20 // The function that actually opens the file
21 struct file *f = do_filp_open(dfd, tmp, &op);
22 if (IS_ERR(f)) {
23 put_unused_fd(fd); // open failed: give back the fd reserved above
24 fd = PTR_ERR(f);
25 } else {
26 fsnotify_open(f);
27 // Bind the fd to the file struct
28 fd_install(fd, f);
29 }
30 }
31 putname(tmp); // reached on every path after getname() succeeded, including fd < 0
32 return fd;
33}
2.1. build_open_flags 构建open_flags #
1// fs/open.c -- validates how->flags / how->resolve / how->mode and fills *op (mode, open_flag, acc_mode, intent, lookup_flags). Returns 0 on success, -EINVAL for rejected input, -EAGAIN for RESOLVE_CACHED with O_TRUNC/O_CREAT/O_TMPFILE. On an error return *op may already be partially written.
2inline int build_open_flags(const struct open_how *how, struct open_flags *op)
3{
4 u64 flags = how->flags;
5 u64 strip = FMODE_NONOTIFY | O_CLOEXEC;
6 int lookup_flags = 0;
7 int acc_mode = ACC_MODE(flags);
8
9 BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
10 "struct open_flags doesn't yet handle flags > 32 bits");
11
12 /*
13 * Strip flags that either shouldn't be set by userspace like
14 * FMODE_NONOTIFY or that aren't relevant in determining struct
15 * open_flags like O_CLOEXEC.
16 */
17 flags &= ~strip;
18
19 /*
20 * Older syscalls implicitly clear all of the invalid flags or argument
21 * values before calling build_open_flags(), but openat2(2) checks all
22 * of its arguments.
23 */
24 if (flags & ~VALID_OPEN_FLAGS)
25 return -EINVAL;
26 if (how->resolve & ~VALID_RESOLVE_FLAGS)
27 return -EINVAL;
28
29 /* Scoping flags are mutually exclusive. */
30 if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT))
31 return -EINVAL;
32
33 /* Deal with the mode: only create-like opens may carry one, and only S_IALLUGO bits. */
34 if (WILL_CREATE(flags)) {
35 if (how->mode & ~S_IALLUGO)
36 return -EINVAL;
37 op->mode = how->mode | S_IFREG;
38 } else {
39 if (how->mode != 0)
40 return -EINVAL;
41 op->mode = 0;
42 }
43
44 /*
45 * In order to ensure programs get explicit errors when trying to use
46 * O_TMPFILE on old kernels, O_TMPFILE is implemented such that it
47 * looks like (O_DIRECTORY|O_RDWR & ~O_CREAT) to old kernels. But we
48 * have to require userspace to explicitly set it.
49 */
50 if (flags & __O_TMPFILE) {
51 if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
52 return -EINVAL;
53 if (!(acc_mode & MAY_WRITE))
54 return -EINVAL;
55 }
56 if (flags & O_PATH) {
57 /* O_PATH only permits certain other flags to be set. */
58 if (flags & ~O_PATH_FLAGS)
59 return -EINVAL;
60 acc_mode = 0;
61 }
62
63 /*
64 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
65 * check for O_DSYNC if the need any syncing at all we enforce it's
66 * always set instead of having to deal with possibly weird behaviour
67 * for malicious applications setting only __O_SYNC.
68 */
69 if (flags & __O_SYNC)
70 flags |= O_DSYNC;
71
72 op->open_flag = flags; /* stored here; later changes to the local flags do not reach op->open_flag */
73
74 /* O_TRUNC implies we need access checks for write permissions */
75 if (flags & O_TRUNC)
76 acc_mode |= MAY_WRITE;
77
78 /* Allow the LSM permission hook to distinguish append
79 access from general write access. */
80 if (flags & O_APPEND)
81 acc_mode |= MAY_APPEND;
82
83 op->acc_mode = acc_mode;
84
85 op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
86
87 if (flags & O_CREAT) {
88 op->intent |= LOOKUP_CREATE;
89 if (flags & O_EXCL) {
90 op->intent |= LOOKUP_EXCL;
91 flags |= O_NOFOLLOW; /* only influences the LOOKUP_FOLLOW decision below */
92 }
93 }
94
95 if (flags & O_DIRECTORY)
96 lookup_flags |= LOOKUP_DIRECTORY;
97 if (!(flags & O_NOFOLLOW))
98 lookup_flags |= LOOKUP_FOLLOW;
99
100 if (how->resolve & RESOLVE_NO_XDEV)
101 lookup_flags |= LOOKUP_NO_XDEV;
102 if (how->resolve & RESOLVE_NO_MAGICLINKS)
103 lookup_flags |= LOOKUP_NO_MAGICLINKS;
104 if (how->resolve & RESOLVE_NO_SYMLINKS)
105 lookup_flags |= LOOKUP_NO_SYMLINKS;
106 if (how->resolve & RESOLVE_BENEATH)
107 lookup_flags |= LOOKUP_BENEATH;
108 if (how->resolve & RESOLVE_IN_ROOT)
109 lookup_flags |= LOOKUP_IN_ROOT;
110 if (how->resolve & RESOLVE_CACHED) {
111 /* Don't bother even trying for create/truncate/tmpfile open */
112 if (flags & (O_TRUNC | O_CREAT | O_TMPFILE))
113 return -EAGAIN;
114 lookup_flags |= LOOKUP_CACHED;
115 }
116
117 op->lookup_flags = lookup_flags;
118 return 0;
119}
3. do_filp_open 打开文件的调用 #
- path_openat会先通过alloc_empty_file申请file结构体的空间,打开成功后将该file作为返回值
- link_path_walk找路径的最后一个分量
- open_last_lookups对最后一个分量进行处理,会查找文件是否存在,不存在则根据条件创建
1// fs/namei.c -- opens pathname relative to dfd: first path_openat() attempt uses LOOKUP_RCU, -ECHILD triggers a retry without it, -ESTALE triggers a retry with LOOKUP_REVAL; returns the struct file * or an ERR_PTR
2struct file *do_filp_open(int dfd, struct filename *pathname,
3 const struct open_flags *op)
4{
5 struct nameidata nd;
6 int flags = op->lookup_flags;
7 struct file *filp;
8
9 set_nameidata(&nd, dfd, pathname, NULL);
10 filp = path_openat(&nd, op, flags | LOOKUP_RCU);
11 if (unlikely(filp == ERR_PTR(-ECHILD)))
12 filp = path_openat(&nd, op, flags);
13 if (unlikely(filp == ERR_PTR(-ESTALE)))
14 filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
15 restore_nameidata();
16 return filp;
17}
18
19// fs/namei.c -- one open attempt: allocates a struct file, dispatches to do_tmpfile() / do_o_path() / the path walk + do_open(), and returns the file or ERR_PTR(error); -EOPENSTALE is mapped to -ECHILD (LOOKUP_RCU set) or -ESTALE
20static struct file *path_openat(struct nameidata *nd,
21 const struct open_flags *op, unsigned flags)
22{
23 struct file *file;
24 int error;
25
26 // Allocate the struct file
27 file = alloc_empty_file(op->open_flag, current_cred());
28 if (IS_ERR(file))
29 return file;
30
31 if (unlikely(file->f_flags & __O_TMPFILE)) {
32 error = do_tmpfile(nd, flags, op, file);
33 } else if (unlikely(file->f_flags & O_PATH)) {
34 error = do_o_path(nd, flags, file);
35 } else {
36 const char *s = path_init(nd, flags);
37 // link_path_walk finds the last component of the path
38 // open_last_lookups handles the last component: it looks up whether the file exists and, if not, creates it depending on the flags
39 while (!(error = link_path_walk(s, nd)) &&
40 (s = open_last_lookups(nd, file, op)) != NULL)
41 ;
42 if (!error)
43 // Once the walk has reached the target, perform the remaining steps of open
44 error = do_open(nd, file, op);
45 terminate_walk(nd);
46 }
47 if (likely(!error)) {
48 if (likely(file->f_mode & FMODE_OPENED))
49 return file;
50 WARN_ON(1); // no error reported but the file was never marked FMODE_OPENED
51 error = -EINVAL;
52 }
53 // This function drops the file's reference count
54 fput(file);
55 if (error == -EOPENSTALE) {
56 if (flags & LOOKUP_RCU)
57 error = -ECHILD;
58 else
59 error = -ESTALE;
60 }
61 return ERR_PTR(error);
62}
4. do_open 打开文件的最后一步 #
1// fs/namei.c
2/*
3 * Handle the last step of open(): finish the walk if the file is neither opened nor created yet, apply the O_CREAT and directory checks, run may_open(), call vfs_open() unless FMODE_OPENED is already set, then ima_file_check() and the optional truncate. Returns 0 or a negative error; a positive error is turned into -EINVAL with a warning.
4 */
5static int do_open(struct nameidata *nd,
6 struct file *file, const struct open_flags *op)
7{
8 struct user_namespace *mnt_userns;
9 int open_flag = op->open_flag;
10 bool do_truncate;
11 int acc_mode;
12 int error;
13
14 if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
15 error = complete_walk(nd);
16 if (error)
17 return error;
18 }
19 if (!(file->f_mode & FMODE_CREATED))
20 audit_inode(nd->name, nd->path.dentry, 0);
21 mnt_userns = mnt_user_ns(nd->path.mnt);
22 if (open_flag & O_CREAT) {
23 if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
24 return -EEXIST;
25 if (d_is_dir(nd->path.dentry))
26 return -EISDIR;
27 error = may_create_in_sticky(mnt_userns, nd,
28 d_backing_inode(nd->path.dentry));
29 if (unlikely(error))
30 return error;
31 }
32 if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
33 return -ENOTDIR;
34
35 do_truncate = false;
36 acc_mode = op->acc_mode;
37 if (file->f_mode & FMODE_CREATED) {
38 /* Don't check for write permission, don't truncate */
39 open_flag &= ~O_TRUNC;
40 acc_mode = 0;
41 } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
42 error = mnt_want_write(nd->path.mnt);
43 if (error)
44 return error;
45 do_truncate = true; /* also records that mnt_drop_write() must run before returning */
46 }
47 error = may_open(mnt_userns, &nd->path, acc_mode, open_flag);
48 if (!error && !(file->f_mode & FMODE_OPENED))
49 error = vfs_open(&nd->path, file);
50 if (!error)
51 error = ima_file_check(file, op->acc_mode); /* uses op->acc_mode, not the local acc_mode that may have been zeroed above */
52 if (!error && do_truncate)
53 error = handle_truncate(mnt_userns, file);
54 if (unlikely(error > 0)) {
55 WARN_ON(1);
56 error = -EINVAL;
57 }
58 if (do_truncate)
59 mnt_drop_write(nd->path.mnt); /* pairs with mnt_want_write() above; runs whether or not error is set */
60 return error;
61}
5. vfs_open 虚拟文件系统打开文件 #
- 设置文件操作集
- 调用文件操作集(f_op)中对应的open函数
1// fs/open.c
2/**
3 * vfs_open - open the file at the given path
4 * @path: path to open
5 * @file: newly allocated file with f_flag initialized
6 * Return: the result of do_dentry_open(), called after *path is copied into file->f_path
7 */
8int vfs_open(const struct path *path, struct file *file)
9{
10 file->f_path = *path;
11 return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
12}
13
14// fs/open.c -- binds f to inode: takes a path reference, handles O_PATH specially, acquires write access when needed, installs f_op from inode->i_fop and calls the open callback (the @open argument, or f->f_op->open when @open is NULL; skipped if both are NULL). Returns 0 or a negative error.
15static int do_dentry_open(struct file *f,
16 struct inode *inode,
17 int (*open)(struct inode *, struct file *))
18{
19 static const struct file_operations empty_fops = {};
20 int error;
21
22 path_get(&f->f_path);
23 f->f_inode = inode;
24 f->f_mapping = inode->i_mapping;
25 f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
26 f->f_sb_err = file_sample_sb_err(f);
27
28 if (unlikely(f->f_flags & O_PATH)) {
29 f->f_mode = FMODE_PATH | FMODE_OPENED;
30 f->f_op = &empty_fops;
31 return 0;
32 }
33
34 if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
35 error = get_write_access(inode);
36 if (unlikely(error))
37 goto cleanup_file;
38 error = __mnt_want_write(f->f_path.mnt);
39 if (unlikely(error)) {
40 put_write_access(inode);
41 goto cleanup_file;
42 }
43 f->f_mode |= FMODE_WRITER; /* tells cleanup_all that both write references above must be dropped */
44 }
45
46 /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
47 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
48 f->f_mode |= FMODE_ATOMIC_POS;
49
50 // Set the file's operations table
51 f->f_op = fops_get(inode->i_fop);
52 if (WARN_ON(!f->f_op)) {
53 error = -ENODEV;
54 goto cleanup_all;
55 }
56
57 error = security_file_open(f);
58 if (error)
59 goto cleanup_all;
60
61 error = break_lease(locks_inode(f), f->f_flags);
62 if (error)
63 goto cleanup_all;
64
65 /* normally all 3 are set; ->open() can clear them if needed */
66 f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
67 // Look up the open method in f_op here and call it
68 if (!open)
69 open = f->f_op->open;
70 if (open) {
71 error = open(inode, f);
72 if (error)
73 goto cleanup_all;
74 }
75 // Mark the file as opened
76 f->f_mode |= FMODE_OPENED;
77 if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
78 i_readcount_inc(inode);
79 if ((f->f_mode & FMODE_READ) &&
80 likely(f->f_op->read || f->f_op->read_iter))
81 f->f_mode |= FMODE_CAN_READ;
82 if ((f->f_mode & FMODE_WRITE) &&
83 likely(f->f_op->write || f->f_op->write_iter))
84 f->f_mode |= FMODE_CAN_WRITE;
85 if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
86 f->f_mode |= FMODE_CAN_ODIRECT;
87
88 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
89
90 file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
91
92 if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
93 return -EINVAL; /* NOTE(review): returns without the cleanup labels, with FMODE_OPENED already set; presumably the caller's fput() undoes the open -- confirm */
94
95 /*
96 * XXX: Huge page cache doesn't support writing yet. Drop all page
97 * cache for this file before processing writes.
98 */
99 if (f->f_mode & FMODE_WRITE) {
100 /*
101 * Paired with smp_mb() in collapse_file() to ensure nr_thps
102 * is up to date and the update to i_writecount by
103 * get_write_access() is visible. Ensures subsequent insertion
104 * of THPs into the page cache will fail.
105 */
106 smp_mb();
107 if (filemap_nr_thps(inode->i_mapping)) {
108 struct address_space *mapping = inode->i_mapping;
109
110 filemap_invalidate_lock(inode->i_mapping);
111 /*
112 * unmap_mapping_range just need to be called once
113 * here, because the private pages is not need to be
114 * unmapped mapping (e.g. data segment of dynamic
115 * shared libraries here).
116 */
117 unmap_mapping_range(mapping, 0, 0, 0);
118 truncate_inode_pages(mapping, 0);
119 filemap_invalidate_unlock(inode->i_mapping);
120 }
121 }
122
123 return 0;
124
125cleanup_all: /* reached after f_op was assigned: drop the fops reference and, if FMODE_WRITER is set, both write references */
126 if (WARN_ON_ONCE(error > 0))
127 error = -EINVAL;
128 fops_put(f->f_op);
129 if (f->f_mode & FMODE_WRITER) {
130 put_write_access(inode);
131 __mnt_drop_write(f->f_path.mnt);
132 }
133cleanup_file: /* drop the path reference taken by path_get() at the top and clear the path/inode fields of f */
134 path_put(&f->f_path);
135 f->f_path.mnt = NULL;
136 f->f_path.dentry = NULL;
137 f->f_inode = NULL;
138 return error;
139}