Network Driver Development Fundamentals

I. Packet flow from the driver into the network protocol stack

@startuml
box kernel #White
    participant net
    participant ksoftirqd
end box
box driver #White
    participant eth
end box
participant cpu

autonumber
net -> eth: net_device_ops::ndo_open
eth -> cpu: register the hardware interrupt handler
eth -> net: call netif_napi_add to register the poll function on the NAPI list

== Packet received ==
autonumber 1
cpu -> eth: hardware interrupt fires
eth -> ksoftirqd: raise the NET_RX_SOFTIRQ softirq
ksoftirqd -> net: dispatch net_rx_action for this softirq type
net -> eth: walk the NAPI list and call the matching poll function
eth -> eth: read frames from the hardware buffer and build sk_buffs
eth -> net: hand the sk_buffs to the kernel via napi_gro_receive
net -> net: napi_gro_receive => napi_skb_finish
net -> net: napi_skb_finish => netif_receive_skb
net -> net: netif_receive_skb => deliver_skb
net -> net: deliver_skb => ip_rcv

@enduml
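In driver code, the two registration steps in the diagram look roughly like the sketch below. This is a hedged illustration, not a real driver: the mydrv_* names and the mydrv_priv structure (with its irq and napi fields) are hypothetical.

// Minimal sketch of ndo_open for a hypothetical NIC driver.
static int mydrv_open(struct net_device *dev)
{
	struct mydrv_priv *priv = netdev_priv(dev);
	int err;

	/* Step 2 of the diagram: register the hardware interrupt handler. */
	err = request_irq(priv->irq, mydrv_isr, IRQF_SHARED, dev->name, dev);
	if (err)
		return err;

	/* Step 3: register the poll function with NAPI.
	 * Note: before kernel 6.1, netif_napi_add() took an extra weight
	 * argument (typically NAPI_POLL_WEIGHT). */
	netif_napi_add(dev, &priv->napi, mydrv_poll);
	napi_enable(&priv->napi);

	netif_start_queue(dev);
	return 0;
}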

II. Driver definition

1. The driver operations structure: net_device_ops

  • Every NIC driver defines this set of operation callbacks (see the minimal sketch after the listing)
// include/linux/netdevice.h
/*
 * This structure defines the management hooks for network devices.
 * The following hooks can be defined; unless noted otherwise, they are
 * optional and can be filled with a null pointer.
 *
 * int (*ndo_init)(struct net_device *dev);
 *     This function is called once when a network device is registered.
 *     The network device can use this for any late stage initialization
 *     or semantic validation. It can fail with an error code which will
 *     be propagated back to register_netdev.
 *
 * void (*ndo_uninit)(struct net_device *dev);
 *     This function is called when device is unregistered or when registration
 *     fails. It is not called if init fails.
 *
 * int (*ndo_open)(struct net_device *dev);
 *     This function is called when a network device transitions to the up
 *     state.
 *
 * int (*ndo_stop)(struct net_device *dev);
 *     This function is called when a network device transitions to the down
 *     state.
 *
 * netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb,
 *                               struct net_device *dev);
 *	Called when a packet needs to be transmitted.
 *	Returns NETDEV_TX_OK.  Can return NETDEV_TX_BUSY, but you should stop
 *	the queue before that can happen; it's for obsolete devices and weird
 *	corner cases, but the stack really does a non-trivial amount
 *	of useless work if you return NETDEV_TX_BUSY.
 *	Required; cannot be NULL.
 *
 * netdev_features_t (*ndo_features_check)(struct sk_buff *skb,
 *					   struct net_device *dev
 *					   netdev_features_t features);
 *	Called by core transmit path to determine if device is capable of
 *	performing offload operations on a given packet. This is to give
 *	the device an opportunity to implement any restrictions that cannot
 *	be otherwise expressed by feature flags. The check is called with
 *	the set of features that the stack has calculated and it returns
 *	those the driver believes to be appropriate.
 *
 * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
 *                         struct net_device *sb_dev);
 *	Called to decide which queue to use when device supports multiple
 *	transmit queues.
 *
 * void (*ndo_change_rx_flags)(struct net_device *dev, int flags);
 *	This function is called to allow device receiver to make
 *	changes to configuration when multicast or promiscuous is enabled.
 *
 * void (*ndo_set_rx_mode)(struct net_device *dev);
 *	This function is called device changes address list filtering.
 *	If driver handles unicast address filtering, it should set
 *	IFF_UNICAST_FLT in its priv_flags.
 *
 * int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
 *	This function  is called when the Media Access Control address
 *	needs to be changed. If this interface is not defined, the
 *	MAC address can not be changed.
 *
 * int (*ndo_validate_addr)(struct net_device *dev);
 *	Test if Media Access Control address is valid for the device.
 *
 * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
 *	Old-style ioctl entry point. This is used internally by the
 *	appletalk and ieee802154 subsystems but is no longer called by
 *	the device ioctl handler.
 *
 * int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd);
 *	Used by the bonding driver for its device specific ioctls:
 *	SIOCBONDENSLAVE, SIOCBONDRELEASE, SIOCBONDSETHWADDR, SIOCBONDCHANGEACTIVE,
 *	SIOCBONDSLAVEINFOQUERY, and SIOCBONDINFOQUERY
 *
 * * int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
 *	Called for ethernet specific ioctls: SIOCGMIIPHY, SIOCGMIIREG,
 *	SIOCSMIIREG, SIOCSHWTSTAMP and SIOCGHWTSTAMP.
 *
 * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
 *	Used to set network devices bus interface parameters. This interface
 *	is retained for legacy reasons; new devices should use the bus
 *	interface (PCI) for low level management.
 *
 * int (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
 *	Called when a user wants to change the Maximum Transfer Unit
 *	of a device.
 *
 * void (*ndo_tx_timeout)(struct net_device *dev, unsigned int txqueue);
 *	Callback used when the transmitter has not made any progress
 *	for dev->watchdog ticks.
 *
 * void (*ndo_get_stats64)(struct net_device *dev,
 *                         struct rtnl_link_stats64 *storage);
 * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
 *	Called when a user wants to get the network device usage
 *	statistics. Drivers must do one of the following:
 *	1. Define @ndo_get_stats64 to fill in a zero-initialised
 *	   rtnl_link_stats64 structure passed by the caller.
 *	2. Define @ndo_get_stats to update a net_device_stats structure
 *	   (which should normally be dev->stats) and return a pointer to
 *	   it. The structure may be changed asynchronously only if each
 *	   field is written atomically.
 *	3. Update dev->stats asynchronously and atomically, and define
 *	   neither operation.
 *
 * bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id)
 *	Return true if this device supports offload stats of this attr_id.
 *
 * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev,
 *	void *attr_data)
 *	Get statistics for offload operations by attr_id. Write it into the
 *	attr_data pointer.
 *
 * int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid);
 *	If device supports VLAN filtering this function is called when a
 *	VLAN id is registered.
 *
 * int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid);
 *	If device supports VLAN filtering this function is called when a
 *	VLAN id is unregistered.
 *
 * void (*ndo_poll_controller)(struct net_device *dev);
 *
 *	SR-IOV management functions.
 * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
 * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan,
 *			  u8 qos, __be16 proto);
 * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate,
 *			  int max_tx_rate);
 * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_get_vf_config)(struct net_device *dev,
 *			    int vf, struct ifla_vf_info *ivf);
 * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state);
 * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
 *			  struct nlattr *port[]);
 *
 *      Enable or disable the VF ability to query its RSS Redirection Table and
 *      Hash Key. This is needed since on some devices VF share this information
 *      with PF and querying it may introduce a theoretical security risk.
 * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
 * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type,
 *		       void *type_data);
 *	Called to setup any 'tc' scheduler, classifier or action on @dev.
 *	This is always called from the stack with the rtnl lock held and netif
 *	tx queues stopped. This allows the netdevice to perform queue
 *	management safely.
 *
 *	Fiber Channel over Ethernet (FCoE) offload functions.
 * int (*ndo_fcoe_enable)(struct net_device *dev);
 *	Called when the FCoE protocol stack wants to start using LLD for FCoE
 *	so the underlying device can perform whatever needed configuration or
 *	initialization to support acceleration of FCoE traffic.
 *
 * int (*ndo_fcoe_disable)(struct net_device *dev);
 *	Called when the FCoE protocol stack wants to stop using LLD for FCoE
 *	so the underlying device can perform whatever needed clean-ups to
 *	stop supporting acceleration of FCoE traffic.
 *
 * int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid,
 *			     struct scatterlist *sgl, unsigned int sgc);
 *	Called when the FCoE Initiator wants to initialize an I/O that
 *	is a possible candidate for Direct Data Placement (DDP). The LLD can
 *	perform necessary setup and returns 1 to indicate the device is set up
 *	successfully to perform DDP on this I/O, otherwise this returns 0.
 *
 * int (*ndo_fcoe_ddp_done)(struct net_device *dev,  u16 xid);
 *	Called when the FCoE Initiator/Target is done with the DDPed I/O as
 *	indicated by the FC exchange id 'xid', so the underlying device can
 *	clean up and reuse resources for later DDP requests.
 *
 * int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid,
 *			      struct scatterlist *sgl, unsigned int sgc);
 *	Called when the FCoE Target wants to initialize an I/O that
 *	is a possible candidate for Direct Data Placement (DDP). The LLD can
 *	perform necessary setup and returns 1 to indicate the device is set up
 *	successfully to perform DDP on this I/O, otherwise this returns 0.
 *
 * int (*ndo_fcoe_get_hbainfo)(struct net_device *dev,
 *			       struct netdev_fcoe_hbainfo *hbainfo);
 *	Called when the FCoE Protocol stack wants information on the underlying
 *	device. This information is utilized by the FCoE protocol stack to
 *	register attributes with Fiber Channel management service as per the
 *	FC-GS Fabric Device Management Information(FDMI) specification.
 *
 * int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type);
 *	Called when the underlying device wants to override default World Wide
 *	Name (WWN) generation mechanism in FCoE protocol stack to pass its own
 *	World Wide Port Name (WWPN) or World Wide Node Name (WWNN) to the FCoE
 *	protocol stack to use.
 *
 *	RFS acceleration.
 * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb,
 *			    u16 rxq_index, u32 flow_id);
 *	Set hardware filter for RFS.  rxq_index is the target queue index;
 *	flow_id is a flow ID to be passed to rps_may_expire_flow() later.
 *	Return the filter ID on success, or a negative error code.
 *
 *	Slave management functions (for bridge, bonding, etc).
 * int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev);
 *	Called to make another netdev an underling.
 *
 * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev);
 *	Called to release previously enslaved netdev.
 *
 * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev,
 *					    struct sk_buff *skb,
 *					    bool all_slaves);
 *	Get the xmit slave of master device. If all_slaves is true, function
 *	assume all the slaves can transmit.
 *
 *      Feature/offload setting functions.
 * netdev_features_t (*ndo_fix_features)(struct net_device *dev,
 *		netdev_features_t features);
 *	Adjusts the requested feature flags according to device-specific
 *	constraints, and returns the resulting flags. Must not modify
 *	the device state.
 *
 * int (*ndo_set_features)(struct net_device *dev, netdev_features_t features);
 *	Called to update device configuration to new features. Passed
 *	feature set might be less than what was returned by ndo_fix_features()).
 *	Must return >0 or -errno if it changed dev->features itself.
 *
 * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[],
 *		      struct net_device *dev,
 *		      const unsigned char *addr, u16 vid, u16 flags,
 *		      struct netlink_ext_ack *extack);
 *	Adds an FDB entry to dev for addr.
 * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[],
 *		      struct net_device *dev,
 *		      const unsigned char *addr, u16 vid)
 *	Deletes the FDB entry from dev coresponding to addr.
 * int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, struct nlattr *tb[],
 *			   struct net_device *dev,
 *			   u16 vid,
 *			   struct netlink_ext_ack *extack);
 * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
 *		       struct net_device *dev, struct net_device *filter_dev,
 *		       int *idx)
 *	Used to add FDB entries to dump requests. Implementers should add
 *	entries to skb and update idx with the number of entries.
 *
 * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh,
 *			     u16 flags, struct netlink_ext_ack *extack)
 * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq,
 *			     struct net_device *dev, u32 filter_mask,
 *			     int nlflags)
 * int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh,
 *			     u16 flags);
 *
 * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
 *	Called to change device carrier. Soft-devices (like dummy, team, etc)
 *	which do not represent real hardware may define this to allow their
 *	userspace components to manage their virtual carrier state. Devices
 *	that determine carrier state from physical hardware properties (eg
 *	network cables) or protocol-dependent mechanisms (eg
 *	USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
 *
 * int (*ndo_get_phys_port_id)(struct net_device *dev,
 *			       struct netdev_phys_item_id *ppid);
 *	Called to get ID of physical port of this device. If driver does
 *	not implement this, it is assumed that the hw is not able to have
 *	multiple net devices on single physical port.
 *
 * int (*ndo_get_port_parent_id)(struct net_device *dev,
 *				 struct netdev_phys_item_id *ppid)
 *	Called to get the parent ID of the physical port of this device.
 *
 * void* (*ndo_dfwd_add_station)(struct net_device *pdev,
 *				 struct net_device *dev)
 *	Called by upper layer devices to accelerate switching or other
 *	station functionality into hardware. 'pdev is the lowerdev
 *	to use for the offload and 'dev' is the net device that will
 *	back the offload. Returns a pointer to the private structure
 *	the upper layer will maintain.
 * void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv)
 *	Called by upper layer device to delete the station created
 *	by 'ndo_dfwd_add_station'. 'pdev' is the net device backing
 *	the station and priv is the structure returned by the add
 *	operation.
 * int (*ndo_set_tx_maxrate)(struct net_device *dev,
 *			     int queue_index, u32 maxrate);
 *	Called when a user wants to set a max-rate limitation of specific
 *	TX queue.
 * int (*ndo_get_iflink)(const struct net_device *dev);
 *	Called to get the iflink value of this device.
 * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb);
 *	This function is used to get egress tunnel information for given skb.
 *	This is useful for retrieving outer tunnel header parameters while
 *	sampling packet.
 * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom);
 *	This function is used to specify the headroom that the skb must
 *	consider when allocation skb during packet reception. Setting
 *	appropriate rx headroom value allows avoiding skb head copy on
 *	forward. Setting a negative value resets the rx headroom to the
 *	default value.
 * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf);
 *	This function is used to set or query state related to XDP on the
 *	netdevice and manage BPF offload. See definition of
 *	enum bpf_netdev_command for details.
 * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp,
 *			u32 flags);
 *	This function is used to submit @n XDP packets for transmit on a
 *	netdevice. Returns number of frames successfully transmitted, frames
 *	that got dropped are freed/returned via xdp_return_frame().
 *	Returns negative number, means general error invoking ndo, meaning
 *	no frames were xmit'ed and core-caller will free all frames.
 * struct net_device *(*ndo_xdp_get_xmit_slave)(struct net_device *dev,
 *					        struct xdp_buff *xdp);
 *      Get the xmit slave of master device based on the xdp_buff.
 * int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags);
 *      This function is used to wake up the softirq, ksoftirqd or kthread
 *	responsible for sending and/or receiving packets on a specific
 *	queue id bound to an AF_XDP socket. The flags field specifies if
 *	only RX, only Tx, or both should be woken up using the flags
 *	XDP_WAKEUP_RX and XDP_WAKEUP_TX.
 * struct devlink_port *(*ndo_get_devlink_port)(struct net_device *dev);
 *	Get devlink port instance associated with a given netdev.
 *	Called with a reference on the netdevice and devlink locks only,
 *	rtnl_lock is not held.
 * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p,
 *			 int cmd);
 *	Add, change, delete or get information on an IPv4 tunnel.
 * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
 *	If a device is paired with a peer device, return the peer instance.
 *	The caller must be under RCU read context.
 * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path);
 *     Get the forwarding path to reach the real device from the HW destination address
 * ktime_t (*ndo_get_tstamp)(struct net_device *dev,
 *			     const struct skb_shared_hwtstamps *hwtstamps,
 *			     bool cycles);
 *	Get hardware timestamp based on normal/adjustable time or free running
 *	cycle counter. This function is required if physical clock supports a
 *	free running cycle counter.
 */
struct net_device_ops {
	int			(*ndo_init)(struct net_device *dev);
	void			(*ndo_uninit)(struct net_device *dev);
	int			(*ndo_open)(struct net_device *dev);
	int			(*ndo_stop)(struct net_device *dev);
	netdev_tx_t		(*ndo_start_xmit)(struct sk_buff *skb,
						  struct net_device *dev);
	netdev_features_t	(*ndo_features_check)(struct sk_buff *skb,
						      struct net_device *dev,
						      netdev_features_t features);
	u16			(*ndo_select_queue)(struct net_device *dev,
						    struct sk_buff *skb,
						    struct net_device *sb_dev);
	void			(*ndo_change_rx_flags)(struct net_device *dev,
						       int flags);
	void			(*ndo_set_rx_mode)(struct net_device *dev);
	int			(*ndo_set_mac_address)(struct net_device *dev,
						       void *addr);
	int			(*ndo_validate_addr)(struct net_device *dev);
	int			(*ndo_do_ioctl)(struct net_device *dev,
					        struct ifreq *ifr, int cmd);
	int			(*ndo_eth_ioctl)(struct net_device *dev,
						 struct ifreq *ifr, int cmd);
	int			(*ndo_siocbond)(struct net_device *dev,
						struct ifreq *ifr, int cmd);
	int			(*ndo_siocwandev)(struct net_device *dev,
						  struct if_settings *ifs);
	int			(*ndo_siocdevprivate)(struct net_device *dev,
						      struct ifreq *ifr,
						      void __user *data, int cmd);
	int			(*ndo_set_config)(struct net_device *dev,
					          struct ifmap *map);
	int			(*ndo_change_mtu)(struct net_device *dev,
						  int new_mtu);
	int			(*ndo_neigh_setup)(struct net_device *dev,
						   struct neigh_parms *);
	void			(*ndo_tx_timeout) (struct net_device *dev,
						   unsigned int txqueue);

	void			(*ndo_get_stats64)(struct net_device *dev,
						   struct rtnl_link_stats64 *storage);
	bool			(*ndo_has_offload_stats)(const struct net_device *dev, int attr_id);
	int			(*ndo_get_offload_stats)(int attr_id,
							 const struct net_device *dev,
							 void *attr_data);
	struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);

	int			(*ndo_vlan_rx_add_vid)(struct net_device *dev,
						       __be16 proto, u16 vid);
	int			(*ndo_vlan_rx_kill_vid)(struct net_device *dev,
						        __be16 proto, u16 vid);
#ifdef CONFIG_NET_POLL_CONTROLLER
	void                    (*ndo_poll_controller)(struct net_device *dev);
	int			(*ndo_netpoll_setup)(struct net_device *dev,
						     struct netpoll_info *info);
	void			(*ndo_netpoll_cleanup)(struct net_device *dev);
#endif
	int			(*ndo_set_vf_mac)(struct net_device *dev,
						  int queue, u8 *mac);
	int			(*ndo_set_vf_vlan)(struct net_device *dev,
						   int queue, u16 vlan,
						   u8 qos, __be16 proto);
	int			(*ndo_set_vf_rate)(struct net_device *dev,
						   int vf, int min_tx_rate,
						   int max_tx_rate);
	int			(*ndo_set_vf_spoofchk)(struct net_device *dev,
						       int vf, bool setting);
	int			(*ndo_set_vf_trust)(struct net_device *dev,
						    int vf, bool setting);
	int			(*ndo_get_vf_config)(struct net_device *dev,
						     int vf,
						     struct ifla_vf_info *ivf);
	int			(*ndo_set_vf_link_state)(struct net_device *dev,
							 int vf, int link_state);
	int			(*ndo_get_vf_stats)(struct net_device *dev,
						    int vf,
						    struct ifla_vf_stats
						    *vf_stats);
	int			(*ndo_set_vf_port)(struct net_device *dev,
						   int vf,
						   struct nlattr *port[]);
	int			(*ndo_get_vf_port)(struct net_device *dev,
						   int vf, struct sk_buff *skb);
	int			(*ndo_get_vf_guid)(struct net_device *dev,
						   int vf,
						   struct ifla_vf_guid *node_guid,
						   struct ifla_vf_guid *port_guid);
	int			(*ndo_set_vf_guid)(struct net_device *dev,
						   int vf, u64 guid,
						   int guid_type);
	int			(*ndo_set_vf_rss_query_en)(
						   struct net_device *dev,
						   int vf, bool setting);
	int			(*ndo_setup_tc)(struct net_device *dev,
						enum tc_setup_type type,
						void *type_data);
#if IS_ENABLED(CONFIG_FCOE)
	int			(*ndo_fcoe_enable)(struct net_device *dev);
	int			(*ndo_fcoe_disable)(struct net_device *dev);
	int			(*ndo_fcoe_ddp_setup)(struct net_device *dev,
						      u16 xid,
						      struct scatterlist *sgl,
						      unsigned int sgc);
	int			(*ndo_fcoe_ddp_done)(struct net_device *dev,
						     u16 xid);
	int			(*ndo_fcoe_ddp_target)(struct net_device *dev,
						       u16 xid,
						       struct scatterlist *sgl,
						       unsigned int sgc);
	int			(*ndo_fcoe_get_hbainfo)(struct net_device *dev,
							struct netdev_fcoe_hbainfo *hbainfo);
#endif

#if IS_ENABLED(CONFIG_LIBFCOE)
#define NETDEV_FCOE_WWNN 0
#define NETDEV_FCOE_WWPN 1
	int			(*ndo_fcoe_get_wwn)(struct net_device *dev,
						    u64 *wwn, int type);
#endif

#ifdef CONFIG_RFS_ACCEL
	int			(*ndo_rx_flow_steer)(struct net_device *dev,
						     const struct sk_buff *skb,
						     u16 rxq_index,
						     u32 flow_id);
#endif
	int			(*ndo_add_slave)(struct net_device *dev,
						 struct net_device *slave_dev,
						 struct netlink_ext_ack *extack);
	int			(*ndo_del_slave)(struct net_device *dev,
						 struct net_device *slave_dev);
	struct net_device*	(*ndo_get_xmit_slave)(struct net_device *dev,
						      struct sk_buff *skb,
						      bool all_slaves);
	struct net_device*	(*ndo_sk_get_lower_dev)(struct net_device *dev,
							struct sock *sk);
	netdev_features_t	(*ndo_fix_features)(struct net_device *dev,
						    netdev_features_t features);
	int			(*ndo_set_features)(struct net_device *dev,
						    netdev_features_t features);
	int			(*ndo_neigh_construct)(struct net_device *dev,
						       struct neighbour *n);
	void			(*ndo_neigh_destroy)(struct net_device *dev,
						     struct neighbour *n);

	int			(*ndo_fdb_add)(struct ndmsg *ndm,
					       struct nlattr *tb[],
					       struct net_device *dev,
					       const unsigned char *addr,
					       u16 vid,
					       u16 flags,
					       struct netlink_ext_ack *extack);
	int			(*ndo_fdb_del)(struct ndmsg *ndm,
					       struct nlattr *tb[],
					       struct net_device *dev,
					       const unsigned char *addr,
					       u16 vid, struct netlink_ext_ack *extack);
	int			(*ndo_fdb_del_bulk)(struct ndmsg *ndm,
						    struct nlattr *tb[],
						    struct net_device *dev,
						    u16 vid,
						    struct netlink_ext_ack *extack);
	int			(*ndo_fdb_dump)(struct sk_buff *skb,
						struct netlink_callback *cb,
						struct net_device *dev,
						struct net_device *filter_dev,
						int *idx);
	int			(*ndo_fdb_get)(struct sk_buff *skb,
					       struct nlattr *tb[],
					       struct net_device *dev,
					       const unsigned char *addr,
					       u16 vid, u32 portid, u32 seq,
					       struct netlink_ext_ack *extack);
	int			(*ndo_bridge_setlink)(struct net_device *dev,
						      struct nlmsghdr *nlh,
						      u16 flags,
						      struct netlink_ext_ack *extack);
	int			(*ndo_bridge_getlink)(struct sk_buff *skb,
						      u32 pid, u32 seq,
						      struct net_device *dev,
						      u32 filter_mask,
						      int nlflags);
	int			(*ndo_bridge_dellink)(struct net_device *dev,
						      struct nlmsghdr *nlh,
						      u16 flags);
	int			(*ndo_change_carrier)(struct net_device *dev,
						      bool new_carrier);
	int			(*ndo_get_phys_port_id)(struct net_device *dev,
							struct netdev_phys_item_id *ppid);
	int			(*ndo_get_port_parent_id)(struct net_device *dev,
							  struct netdev_phys_item_id *ppid);
	int			(*ndo_get_phys_port_name)(struct net_device *dev,
							  char *name, size_t len);
	void*			(*ndo_dfwd_add_station)(struct net_device *pdev,
							struct net_device *dev);
	void			(*ndo_dfwd_del_station)(struct net_device *pdev,
							void *priv);

	int			(*ndo_set_tx_maxrate)(struct net_device *dev,
						      int queue_index,
						      u32 maxrate);
	int			(*ndo_get_iflink)(const struct net_device *dev);
	int			(*ndo_fill_metadata_dst)(struct net_device *dev,
						       struct sk_buff *skb);
	void			(*ndo_set_rx_headroom)(struct net_device *dev,
						       int needed_headroom);
	int			(*ndo_bpf)(struct net_device *dev,
					   struct netdev_bpf *bpf);
	int			(*ndo_xdp_xmit)(struct net_device *dev, int n,
						struct xdp_frame **xdp,
						u32 flags);
	struct net_device *	(*ndo_xdp_get_xmit_slave)(struct net_device *dev,
							  struct xdp_buff *xdp);
	int			(*ndo_xsk_wakeup)(struct net_device *dev,
						  u32 queue_id, u32 flags);
	struct devlink_port *	(*ndo_get_devlink_port)(struct net_device *dev);
	int			(*ndo_tunnel_ctl)(struct net_device *dev,
						  struct ip_tunnel_parm *p, int cmd);
	struct net_device *	(*ndo_get_peer_dev)(struct net_device *dev);
	int                     (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx,
                                                         struct net_device_path *path);
	ktime_t			(*ndo_get_tstamp)(struct net_device *dev,
						  const struct skb_shared_hwtstamps *hwtstamps,
						  bool cycles);
};
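  • A driver fills in only the hooks its hardware supports and leaves the rest NULL; as the comment block notes, ndo_start_xmit is the one mandatory hook. A minimal, hypothetical sketch (the mydrv_* functions are placeholders; eth_mac_addr and eth_validate_addr are the stock helpers from <linux/etherdevice.h>):

// Hypothetical minimal ops table for an Ethernet NIC.
static const struct net_device_ops mydrv_netdev_ops = {
	.ndo_open		= mydrv_open,
	.ndo_stop		= mydrv_stop,
	.ndo_start_xmit		= mydrv_start_xmit,	/* required; cannot be NULL */
	.ndo_set_mac_address	= eth_mac_addr,		/* generic helper */
	.ndo_validate_addr	= eth_validate_addr,	/* generic helper */
};

// Installed on the device before register_netdev():
//	dev->netdev_ops = &mydrv_netdev_ops;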

III. Softirq handling

  • In net_dev_init, callbacks are registered for the two softirqs NET_RX_SOFTIRQ and NET_TX_SOFTIRQ (a sketch of how a driver triggers NET_RX_SOFTIRQ follows the listing)
// net/core/dev.c
/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
    ...
	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
    ...
}
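  • A driver never raises NET_RX_SOFTIRQ by hand. Its hard-IRQ handler calls napi_schedule(), which links the napi_struct onto this CPU's softnet_data poll list and raises NET_RX_SOFTIRQ internally. A hedged sketch, with hypothetical mydrv_* names:

// Hypothetical hard-IRQ handler: quiesce the NIC and defer the real
// work to NAPI, which runs later in NET_RX_SOFTIRQ context.
static irqreturn_t mydrv_isr(int irq, void *data)
{
	struct net_device *dev = data;
	struct mydrv_priv *priv = netdev_priv(dev);

	mydrv_disable_hw_irqs(priv);		/* stop further interrupts */
	if (napi_schedule_prep(&priv->napi))
		__napi_schedule(&priv->napi);	/* raises NET_RX_SOFTIRQ */

	return IRQ_HANDLED;
}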

1. The receive softirq: NET_RX_SOFTIRQ

  • The handler:
// net/core/dev.c
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
    ...
	for (;;) {
		struct napi_struct *n;

		skb_defer_free_flush(sd);

		if (list_empty(&list)) {
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				goto end;
			break;
		}

		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll);
        ...
	}
    ...
}
  • The entry is handed to napi_poll, which looks up and invokes the matching poll function (a driver-side poll sketch follows the kernel code below)
// net/core/dev.c
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	bool do_repoll = false;
	void *have;
	int work;

	list_del_init(&n->poll_list);

	have = netpoll_poll_lock(n);

	work = __napi_poll(n, &do_repoll);

	if (do_repoll)
		list_add_tail(&n->poll_list, repoll);

	netpoll_poll_unlock(have);

	return work;
}

// net/core/dev.c
static int __napi_poll(struct napi_struct *n, bool *repoll)
{
	int work, weight;

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi().  Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call.  Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight);
		trace_napi_poll(n, work, weight);
	}

	if (unlikely(work > weight))
		netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
				n->poll, work, weight);

	if (likely(work < weight))
		return work;

	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight.  In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at-will.
	 */
	if (unlikely(napi_disable_pending(n))) {
		napi_complete(n);
		return work;
	}

	/* The NAPI context has more processing work, but busy-polling
	 * is preferred. Exit early.
	 */
	if (napi_prefer_busy_poll(n)) {
		if (napi_complete_done(n, work)) {
			/* If timeout is not set, we need to make sure
			 * that the NAPI is re-scheduled.
			 */
			napi_schedule(n);
		}
		return work;
	}

	if (n->gro_bitmask) {
		/* flush too old packets
		 * If HZ < 1000, flush all packets.
		 */
		napi_gro_flush(n, HZ >= 1000);
	}

	gro_normal_list(n);

	/* Some drivers may have called napi_schedule
	 * prior to exhausting their budget.
	 */
	if (unlikely(!list_empty(&n->poll_list))) {
		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
			     n->dev ? n->dev->name : "backlog");
		return work;
	}

	*repoll = true;

	return work;
}
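  • On the driver side, the poll callback invoked by __napi_poll usually follows the pattern below. This is a hypothetical sketch (mydrv_rx_next_frame and mydrv_enable_hw_irqs are placeholders), but it mirrors the budget contract enforced above: return the number of frames processed, and only complete NAPI and re-enable device interrupts once the ring is drained (work < budget)

// Hypothetical NAPI poll: drain up to 'budget' frames from the RX ring
// and hand each one to the stack via napi_gro_receive().
static int mydrv_poll(struct napi_struct *napi, int budget)
{
	struct mydrv_priv *priv = container_of(napi, struct mydrv_priv, napi);
	int work = 0;

	while (work < budget) {
		/* read one frame from the hardware buffer into an sk_buff */
		struct sk_buff *skb = mydrv_rx_next_frame(priv);

		if (!skb)
			break;
		skb->protocol = eth_type_trans(skb, priv->dev);
		napi_gro_receive(napi, skb);	/* hand off to the stack */
		work++;
	}

	/* Ring drained before the budget ran out: complete NAPI and
	 * turn device interrupts back on. */
	if (work < budget && napi_complete_done(napi, work))
		mydrv_enable_hw_irqs(priv);

	return work;
}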
  • After the poll call above, processing moves into the driver, which hands packets up by calling napi_gro_receive
// net/core/gro.c
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	gro_result_t ret;

	skb_mark_napi_id(skb, napi);
	trace_napi_gro_receive_entry(skb);

	skb_gro_reset_offset(skb, 0);

	ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
	trace_napi_gro_receive_exit(ret);

	return ret;
}
EXPORT_SYMBOL(napi_gro_receive);
  • This reaches napi_skb_finish and funnels into netif_receive_skb
// net/core/gro.c
static gro_result_t napi_skb_finish(struct napi_struct *napi,
				    struct sk_buff *skb,
				    gro_result_t ret)
{
	switch (ret) {
	case GRO_NORMAL:
		gro_normal_one(napi, skb, 1);
		break;

	case GRO_MERGED_FREE:
		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
			napi_skb_free_stolen_head(skb);
		else if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
			__kfree_skb(skb);
		else
			__kfree_skb_defer(skb);
		break;

	case GRO_HELD:
	case GRO_MERGED:
	case GRO_CONSUMED:
		break;
	}

	return ret;
}

// include/net/gro.h
/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
 * pass the whole batch up to the stack.
 */
static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
{
	list_add_tail(&skb->list, &napi->rx_list);
	napi->rx_count += segs;
	if (napi->rx_count >= gro_normal_batch)
		gro_normal_list(napi);
}

// include/net/gro.h
/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
static inline void gro_normal_list(struct napi_struct *napi)
{
	if (!napi->rx_count)
		return;
	netif_receive_skb_list_internal(&napi->rx_list);
	INIT_LIST_HEAD(&napi->rx_list);
	napi->rx_count = 0;
}

// net/core/dev.c
void netif_receive_skb_list_internal(struct list_head *head)
{
	struct sk_buff *skb, *next;
	struct list_head sublist;

	INIT_LIST_HEAD(&sublist);
	list_for_each_entry_safe(skb, next, head, list) {
		net_timestamp_check(netdev_tstamp_prequeue, skb);
		skb_list_del_init(skb);
		if (!skb_defer_rx_timestamp(skb))
			list_add_tail(&skb->list, &sublist);
	}
	list_splice_init(&sublist, head);

	rcu_read_lock();
#ifdef CONFIG_RPS
	if (static_branch_unlikely(&rps_needed)) {
		list_for_each_entry_safe(skb, next, head, list) {
			struct rps_dev_flow voidflow, *rflow = &voidflow;
			int cpu = get_rps_cpu(skb->dev, skb, &rflow);

			if (cpu >= 0) {
				/* Will be handled, remove from list */
				skb_list_del_init(skb);
				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
			}
		}
	}
#endif
	__netif_receive_skb_list(head);
	rcu_read_unlock();
}

// net/core/dev.c
static void __netif_receive_skb_list(struct list_head *head)
{
	unsigned long noreclaim_flag = 0;
	struct sk_buff *skb, *next;
	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */

	list_for_each_entry_safe(skb, next, head, list) {
		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
			struct list_head sublist;

			/* Handle the previous sublist */
			list_cut_before(&sublist, head, &skb->list);
			if (!list_empty(&sublist))
				__netif_receive_skb_list_core(&sublist, pfmemalloc);
			pfmemalloc = !pfmemalloc;
			/* See comments in __netif_receive_skb */
			if (pfmemalloc)
				noreclaim_flag = memalloc_noreclaim_save();
			else
				memalloc_noreclaim_restore(noreclaim_flag);
		}
	}
	/* Handle the remaining sublist */
	if (!list_empty(head))
		__netif_receive_skb_list_core(head, pfmemalloc);
	/* Restore pflags */
	if (pfmemalloc)
		memalloc_noreclaim_restore(noreclaim_flag);
}

// net/core/dev.c
static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
{
	/* Fast-path assumptions:
	 * - There is no RX handler.
	 * - Only one packet_type matches.
	 * If either of these fails, we will end up doing some per-packet
	 * processing in-line, then handling the 'last ptype' for the whole
	 * sublist.  This can't cause out-of-order delivery to any single ptype,
	 * because the 'last ptype' must be constant across the sublist, and all
	 * other ptypes are handled per-packet.
	 */
	/* Current (common) ptype of sublist */
	struct packet_type *pt_curr = NULL;
	/* Current (common) orig_dev of sublist */
	struct net_device *od_curr = NULL;
	struct list_head sublist;
	struct sk_buff *skb, *next;

	INIT_LIST_HEAD(&sublist);
	list_for_each_entry_safe(skb, next, head, list) {
		struct net_device *orig_dev = skb->dev;
		struct packet_type *pt_prev = NULL;

		skb_list_del_init(skb);
		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
		if (!pt_prev)
			continue;
		if (pt_curr != pt_prev || od_curr != orig_dev) {
			/* dispatch old sublist */
			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
			/* start new sublist */
			INIT_LIST_HEAD(&sublist);
			pt_curr = pt_prev;
			od_curr = orig_dev;
		}
		list_add_tail(&skb->list, &sublist);
	}

	/* dispatch final sublist */
	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
}
  • __netif_receive_skb_core does the per-packet processing, and a tracepoint is emitted inside it, so received packets can be traced from this function (e.g. via the net:netif_receive_skb tracepoint)
  • tcpdump's capture also hooks in here by registering a pseudo protocol handler; delivery down the handler chain ignores return values, so every handler on the chain gets to process the packet
// net/core/dev.c
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
				    struct packet_type **ppt_prev)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct sk_buff *skb = *pskb;
	struct net_device *orig_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	// the tracepoint mentioned above is emitted here
	trace_netif_receive_skb(skb);

	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (static_branch_unlikely(&generic_xdp_needed_key)) {
		int ret2;

		migrate_disable();
		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
		migrate_enable();

		if (ret2 != XDP_PASS) {
			ret = NET_RX_DROP;
			goto out;
		}
	}

	if (eth_type_vlan(skb->protocol)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

	if (skb_skip_tc_classify(skb))
		goto skip_classify;

	if (pfmemalloc)
		goto skip_taps;

	// tcpdump's capture entry point: tcpdump hooks its pseudo protocol here
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}
	...
}
  • deliver_skb then invokes the registered func handler
/* Call stack down to this point:
ip_rcv(struct sk_buff * skb, struct net_device * dev, struct packet_type * pt, struct net_device * orig_dev) (/net/ipv4/ip_input.c:565)
deliver_skb(struct packet_type * pt_prev) (/net/core/dev.c:2189)
deliver_ptype_list_skb(struct list_head * ptype_list, __be16 type, struct net_device * orig_dev, struct packet_type ** pt, struct sk_buff * skb) (/net/core/dev.c:2204)
__netif_receive_skb_core(struct sk_buff ** pskb, bool pfmemalloc, struct packet_type ** ppt_prev) (/net/core/dev.c:5440)
__netif_receive_skb_list_core(struct list_head * head, bool pfmemalloc) (/net/core/dev.c:5560)
__netif_receive_skb_list(struct list_head * head) (/net/core/dev.c:5627)
netif_receive_skb_list_internal(struct list_head * head) (/net/core/dev.c:5718)
gro_normal_list(struct napi_struct * napi) (/include/net/gro.h:430)
gro_normal_list(struct napi_struct * napi) (/include/net/gro.h:426)
napi_complete_done(struct napi_struct * n, int work_done) (/net/core/dev.c:6059)
e1000_clean(struct napi_struct * napi, int budget) (/drivers/net/ethernet/intel/e1000/e1000_main.c:3811)
__napi_poll(struct napi_struct * n, bool * repoll) (/net/core/dev.c:6492)
napi_poll(struct list_head * repoll, struct napi_struct * n) (/net/core/dev.c:6559)
net_rx_action(struct softirq_action * h) (/net/core/dev.c:6670)
__do_softirq() (/kernel/softirq.c:571)
invoke_softirq() (/kernel/softirq.c:445)
__irq_exit_rcu() (/kernel/softirq.c:650)
irq_exit_rcu() (/kernel/softirq.c:662)
common_interrupt(struct pt_regs * regs, unsigned long error_code) (/arch/x86/kernel/irq.c:240)
 */
// net/core/dev.c
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
		return -ENOMEM;
	refcount_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
  • Handlers are registered through the function below
// net/core/dev.c
/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
  • IPv4 registers through exactly this interface, which is why IPv4 packets end up in ip_rcv (a custom-handler sketch follows the listing)
// net/ipv4/af_inet.c
static struct packet_type ip_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.func = ip_rcv,
	.list_func = ip_list_rcv,
};

// net/ipv4/af_inet.c
static int __init inet_init(void)
{
	...
    // register with the dev layer
	dev_add_pack(&ip_packet_type);
	...
}

fs_initcall(inet_init);
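  • The same interface is open to any module. As a rough sketch of the tcpdump-style tap described earlier, the hypothetical module below registers a packet_type with ETH_P_ALL, which ptype_head() places on the ptype_all list that __netif_receive_skb_core walks:

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_ether.h>

// Hypothetical tap: sees every received frame, like tcpdump does.
static int mytap_rcv(struct sk_buff *skb, struct net_device *dev,
		     struct packet_type *pt, struct net_device *orig_dev)
{
	pr_info("mytap: %u bytes on %s\n", skb->len, dev->name);
	kfree_skb(skb);		/* drop the reference deliver_skb took for us */
	return NET_RX_SUCCESS;
}

static struct packet_type mytap_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),	/* ETH_P_ALL => global ptype_all list */
	.func = mytap_rcv,
};

static int __init mytap_init(void)
{
	dev_add_pack(&mytap_packet_type);
	return 0;
}

static void __exit mytap_exit(void)
{
	dev_remove_pack(&mytap_packet_type);
}

module_init(mytap_init);
module_exit(mytap_exit);
MODULE_LICENSE("GPL");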