Dec 24, 2023 9 min read linux进程管理

linux进程管理-1描述符和状态

1、task_struct数据结构

描述进程状态的基础数据结构名称是task_struct，在内核的include/linux/sched.h中定义。

随着内核的发展，这个结构体里的内容越来越多，6.6的版本上，现在直接800+行，下边截取一些关键的片段。

/*
 * Abridged excerpt of the kernel's process descriptor; "......" marks
 * fields that the article elided.
 */
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * For reasons of header soup (see current_thread_info()), this
	 * must be the first element of task_struct.
	 */
	struct thread_info		thread_info;
#endif
	unsigned int			__state; // Task state bits such as TASK_RUNNING, TASK_INTERRUPTIBLE, TASK_UNINTERRUPTIBLE (D state)
......
	int				prio; // Task priority
	int				static_prio;
	int				normal_prio;
	unsigned int			rt_priority;
	struct sched_entity		se; // Scheduling entity of this task
	struct sched_rt_entity		rt;
	struct sched_dl_entity		dl;
	const struct sched_class	*sched_class;
#ifdef CONFIG_CGROUP_SCHED
	struct task_group		*sched_task_group; // cgroup task group used for scheduling
#endif
......
	unsigned int			policy;
	int				nr_cpus_allowed; // Number of CPUs the task may be scheduled on
	const cpumask_t			*cpus_ptr; // Pointer to the CPU mask that applies to this task
	cpumask_t			*user_cpus_ptr;
	cpumask_t			cpus_mask;
	void				*migration_pending;
	struct sched_info		sched_info; // Scheduling statistics for this task
	struct list_head		tasks; // List node (doubly linked list) chaining task_structs together
	struct mm_struct		*mm; // Memory-related information of the task
	struct mm_struct		*active_mm;
	...... // further memory-related fields follow (elided)
	struct restart_block		restart_block;
	pid_t				pid; // Basic identifiers: pid, and tgid (thread group ID)
	pid_t				tgid;
	/* Real parent process: */
	struct task_struct __rcu	*real_parent; // The parent process that created this task
	/* Recipient of SIGCHLD, wait4() reports: */
	struct task_struct __rcu	*parent; // The process that receives SIGCHLD
	/*
	 * Children/sibling form the list of natural children:
	 */
	struct list_head		children; // children/sibling organise task_structs as a tree
	struct list_head		sibling;
	struct task_struct		*group_leader;
	/* PID/PID hash table linkage. */
	struct pid			*thread_pid; // struct pid used to look the task up by PID
	struct hlist_node		pid_links[PIDTYPE_MAX];
	struct list_head		thread_group;
	struct list_head		thread_node;
	/*
	 * executable name, excluding path.
	 *
	 * - normally initialized setup_new_exec()
	 * - access it with [gs]et_task_comm()
	 * - lock it with task_lock()
	 */
	char				comm[TASK_COMM_LEN]; // Executable name in a TASK_COMM_LEN-byte buffer (16 per the article author; presumably includes the terminator - verify)
	struct nameidata		*nameidata;
	/* Filesystem information: */
	struct fs_struct		*fs; // Filesystem-related resources of the task
	/* Open file information: */
	struct files_struct		*files; // Open-file resources of the task
	/* Signal handlers: */
	struct signal_struct		*signal; // Signal-related state (not scheduling information)
	/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
	spinlock_t			alloc_lock; // Spinlock guarding the resources listed in the comment above
	struct wake_q_node		wake_q; // Wake-queue node used when waking the task; shows up as wakeup events in scheduling
	/* CPU-specific state of this task: */
	struct thread_struct		thread;
	/*
	 * WARNING: on x86, 'thread_struct' contains a variable-sized
	 * structure.  It *MUST* be at the end of 'task_struct'.
	 *
	 * Do not put anything below here!
	 */
};

从task_struct中不难看到，内核中管理各个进程的方式一共有三种：

双向链表
树

可以比较方便的用来查看进程之间的父子关系

哈希

其实主要是通过一个表示进程pid的hash来管理，方便通过pid号直接找到对应进程的task_struct信息。

这些查找操作通过kernel/pid.c里的相关函数完成，比如find_pid_ns或者find_vpid；而网上常说的下面这个pid_hashfn宏，其实早已经被删除了。

/* Legacy macro (the article dates it to v2.6); hashes x with hash_long() using pidhash_shift. Quoted only to show what no longer exists. */
#define pid_hashfn(x)    hash_long((unsigned long)x, pidhash_shift)

（这里吐槽一下，中文互联网真的是喜欢传播旧的东西。一个v2.6的代码，被无数人复制粘贴，下面这篇2023年的文章还在用这个宏：https://developer.aliyun.com/article/1204882 。）

但是很奇怪的是，下边这个commit又说删除了pidhash的相关数据结构。这里后面还需要再看一下，是不是将pidhash中的信息放在了idr（ID Radix树）中；从代码中也确实能发现，只剩下idr init，而没有了hash init。

From e8cfbc245e24887e3c30235f71e9e9405e0cfc39 Mon Sep 17 00:00:00 2001
From: Gargi Sharma <gs051095@gmail.com>
Date: Fri, 17 Nov 2017 15:30:34 -0800
Subject: [PATCH] pid: remove pidhash

pidhash is no longer required as all the information can be looked up
from idr tree.  nr_hashed represented the number of pids that had been
hashed.  Since, nr_hashed and PIDNS_HASH_ADDING are no longer relevant,
it has been renamed to pid_allocated and PIDNS_ADDING respectively.

2、进程状态

Linux的进程中主要有5种状态:

R (TASK_RUNNING)，可运行状态（正在CPU上运行，或在运行队列中就绪等待调度）。
S (TASK_INTERRUPTIBLE)，可中断的睡眠状态
D (TASK_UNINTERRUPTIBLE)，不可中断的睡眠状态。
Z (TASK_DEAD - EXIT_ZOMBIE)，僵尸态。
T (TASK_STOPPED or TASK_TRACED)，暂停状态或跟踪状态。

/*
 * Task state bit values. The wake-up code quoted in this article writes
 * values such as TASK_RUNNING into p->__state; the EXIT_* values belong
 * to tsk->exit_state instead (see the comment below).
 */
#define TASK_RUNNING			0x00000000
#define TASK_INTERRUPTIBLE		0x00000001
#define TASK_UNINTERRUPTIBLE		0x00000002
#define __TASK_STOPPED			0x00000004
#define __TASK_TRACED			0x00000008
/* Used in tsk->exit_state: */
#define EXIT_DEAD			0x00000010
#define EXIT_ZOMBIE			0x00000020

2.1 就绪态

进程会在以下几种情况下切换到TASK_RUNNING状态（不是全部，举例说明）：

进程刚被创建时。

当一个新进程被创建时，它的初始状态是TASK_RUNNING。

在早期的内核中，进程是通过_do_fork来创建的；v5.10中的下面这个commit将其更名为kernel_clone。

commit cad6967ac10843a70842cd39c7b53412901dd21f
Author: Christian Brauner <brauner@kernel.org>
Date:   Wed Aug 19 12:46:45 2020 +0200

    fork: introduce kernel_clone()

在这个函数中，会调用wake_up_new_task，由它把新进程的状态设置为TASK_RUNNING：

/* Excerpt of kernel_clone(): only the call that hands the task p to wake_up_new_task() is shown. */
pid_t kernel_clone(struct kernel_clone_args *args)
{
  ......
	wake_up_new_task(p);
  ......
}

/* Excerpt of wake_up_new_task(): shows where the task p gets its TASK_RUNNING state. */
void wake_up_new_task(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;

	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
	/* With p->pi_lock taken, mark the task as TASK_RUNNING. */
	WRITE_ONCE(p->__state, TASK_RUNNING);
  ......
}

进程从等待状态（TASK_INTERRUPTIBLE或TASK_UNINTERRUPTIBLE）唤醒。

当进程等待某个事件（如I/O完成、信号量可用等）时，它会进入等待状态。当事件发生时，进程会被唤醒并切换到TASK_RUNNING状态。

比较常见的是通过__wake_up_common来唤醒进程，并继续执行相关的调度。

 static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
			int nr_exclusive, int wake_flags, void *key,
			wait_queue_entry_t *bookmark)
{
	/* Excerpt: only the invocation of the wait-queue entry's callback is shown. */
	wait_queue_entry_t *curr, *next;
	......
		/* Call the entry's ->func callback with the wake mode, flags and key. */
		ret = curr->func(curr, mode, wake_flags, key);
  ......
}

等待队列项通常使用include/linux/wait.h中定义的.func = default_wake_function来实现wake操作，而default_wake_function内部会调用try_to_wake_up：

/* Excerpt of try_to_wake_up(): shows the trace_sched_waking() call followed by ttwu_do_wakeup() for task p. */
int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
		......
		trace_sched_waking(p);
		ttwu_do_wakeup(p);
    ......
}

/* Sets the state of task p to TASK_RUNNING, then calls trace_sched_wakeup(p). */
static inline void ttwu_do_wakeup(struct task_struct *p)
{
	WRITE_ONCE(p->__state, TASK_RUNNING);
	trace_sched_wakeup(p);
}

进程从暂停状态（TASK_STOPPED或TASK_TRACED）恢复。

当进程因为接收到SIGSTOP、SIGTSTP、SIGTTIN或SIGTTOU信号而暂停时，它会进入TASK_STOPPED状态（被ptrace跟踪而停下时则进入TASK_TRACED状态）。当进程接收到SIGCONT信号时，它会恢复运行并切换到TASK_RUNNING状态。

/* Excerpt of prepare_signal(): only the SIGCONT branch is shown, with parts elided. */
static bool prepare_signal(int sig, struct task_struct *p, bool force)
{
	......
	} else if (sig == SIGCONT) {
		unsigned int why;
		/*
		 * Remove all stop signals from all queues, wake all threads.
		 */
		siginitset(&flush, SIG_KERNEL_STOP_MASK);
		flush_sigqueue_mask(&flush, &signal->shared_pending);
		for_each_thread(p, t) {
				......
				/* Wake thread t, passing __TASK_STOPPED as the state (presumably only stopped threads match - verify). */
				wake_up_state(t, __TASK_STOPPED);
				......
}

可以看到，这里就是从一个stop状态的进程切换回运行态。

执行正常调度的时候

这里是指，在调用schedule()对queue里的task进行调度时，会自然将进程切换为running态。

首先是在__schedule中，判断当前的进程（将要被切换走的）是不是有未处理的信号，如果有，则直接将这个进程切成running
```
		/* If signal_pending_state() reports true for prev, put prev back to TASK_RUNNING. */
		if (signal_pending_state(prev_state, prev)) {
			WRITE_ONCE(prev->__state, TASK_RUNNING);
		} 
```
接下来，则是通过cfs，开始选择下一个执行的task，并在context_switch里更换为running状态。
```
/* Obtain next from pick_next_task(), then call context_switch() with prev and next. */
next = pick_next_task(rq, prev, &rf);
......
rq = context_switch(rq, prev, next, &rf);
```
（这是gpt说的，但代码里根本没找到；感觉这里根本就不会切换状态：cfs在挑选的时候是在rq中选一个task，而rq中的task已经是running的了，gpt在胡扯）

2.2-TASK_INTERRUPTIBLE

常见于进程完成当前调度任务后主动进入休眠，也可以用于等待部分IO资源。

例如下列这种用法，vcpu线程在执行完当前周期后会主动陷入sleep，并让出CPU，等待下一次调度唤醒。

/*
 * Blocks the calling vCPU thread: repeatedly sets the current task state to
 * TASK_INTERRUPTIBLE and calls schedule() until kvm_vcpu_check_block()
 * returns a negative value.
 *
 * Returns true if schedule() was reached at least once, false otherwise.
 */
bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
	bool waited = false;

	vcpu->stat.generic.blocking = 1;

	preempt_disable();
	kvm_arch_vcpu_blocking(vcpu);
	prepare_to_rcuwait(wait);
	preempt_enable();

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE); // Set our own state to TASK_INTERRUPTIBLE

		if (kvm_vcpu_check_block(vcpu) < 0)
			break;

		waited = true;
		schedule(); // Voluntarily give up the CPU and actually go to sleep
	}

	preempt_disable();
	finish_rcuwait(wait);
	kvm_arch_vcpu_unblocking(vcpu);
	preempt_enable();

	vcpu->stat.generic.blocking = 0;

	return waited;
}

可被signal唤醒。

2.3-TASK_UNINTERRUPTIBLE

主要是在等待资源，例如关键的IO资源，例如下列这段md_bitmap_startwrite部分函数

/*
 * Fragment (per the article, from md_bitmap_startwrite): when the counter
 * equals COUNTER_MAX, queue on bitmap->overflow_wait in
 * TASK_UNINTERRUPTIBLE, drop the counts lock, call schedule(), then
 * continue the enclosing loop (not shown).
 */
if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) {
    DEFINE_WAIT(__wait);
    /* note that it is safe to do the prepare_to_wait
     * after the test as long as we do it before dropping
     * the spinlock.
     */
    prepare_to_wait(&bitmap->overflow_wait, &__wait,
            TASK_UNINTERRUPTIBLE);
    spin_unlock_irq(&bitmap->counts.lock);
    schedule();
    finish_wait(&bitmap->overflow_wait, &__wait);
    continue;
}

有时在进程进入关键流程后，也会将其设置为TASK_UNINTERRUPTIBLE，比如在kthread_stop中等待线程完成并返回时，会通过wait_for_completion将自身设置成TASK_UNINTERRUPTIBLE，避免被其他信号意外唤醒。

/* Waits on completion x by calling wait_for_common() with MAX_SCHEDULE_TIMEOUT and TASK_UNINTERRUPTIBLE. */
void __sched wait_for_completion(struct completion *x)
{
	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}

不可被signal唤醒，只能在所等待的条件满足后，由其他执行流显式调用wake_up等接口来唤醒。

2.4-`EXIT_ZOMBIE`&`EXIT_DEAD`

当一个子进程结束执行（即完成它的任务或被终止），但其父进程尚未通过调用 wait() 系列函数来读取子进程的退出状态时，子进程会进入 EXIT_ZOMBIE 状态。在这种状态下，子进程的大部分资源都被释放了，但它在进程表中保留了一个条目，其中包含了退出状态、进程ID等信息，以便父进程稍后查询。在do_exit()中，负责通知父进程的函数exit_notify会将进程设置为僵尸态。

/* Excerpt of exit_notify(): only the assignment that puts tsk into EXIT_ZOMBIE is shown. */
static void exit_notify(struct task_struct *tsk, int group_dead)
{
	bool autoreap;
	struct task_struct *p, *n;
	LIST_HEAD(dead);

	......

	/* Record the zombie state in exit_state. */
	tsk->exit_state = EXIT_ZOMBIE;
  ......
}

当父进程通过 wait() 系列函数读取了僵尸子进程的退出状态后，子进程的状态会变为 EXIT_DEAD。在这个状态下，子进程的进程表条目和剩余资源将被彻底释放和回收。在wait_task_zombie中会设置相关状态。

/*
 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 *
 * Excerpt only: elided parts are marked with "......".
 */
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
  ......
	if (state == EXIT_TRACE) {
		write_lock_irq(&tasklist_lock);
		/* We dropped tasklist, ptracer could die and untrace */
		ptrace_unlink(p);

		/* If parent wants a zombie, don't release it now */
		state = EXIT_ZOMBIE;
		if (do_notify_parent(p, p->exit_signal))
			state = EXIT_DEAD;
		/* Store the chosen state (EXIT_ZOMBIE or EXIT_DEAD) in the task. */
		p->exit_state = state;
		write_unlock_irq(&tasklist_lock);
	}
	/* Only a task whose state is EXIT_DEAD is released here. */
	if (state == EXIT_DEAD)
		release_task(p);
  ......
}

2.5-`__TASK_STOPPED`&`__TASK_TRACED`

__TASK_TRACED 状态表示进程正在被追踪（例如被ptrace跟踪）而停下。

/*
 * Wakes task t through signal_wake_up_state(). When resume is true,
 * JOBCTL_TRACED is cleared from t->jobctl and __TASK_TRACED is passed as
 * the state argument; otherwise 0 is passed.
 */
static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
{
	unsigned int state = 0;
	if (resume) {
		t->jobctl &= ~JOBCTL_TRACED;
		state = __TASK_TRACED;
	}
	signal_wake_up_state(t, state);
}

__TASK_STOPPED 状态表示进程已经停止执行，通常是因为收到了如 SIGSTOP、SIGTSTP、SIGTTIN 或 SIGTTOU 等信号。

例如do_signal_stop中会通过set_special_state(TASK_STOPPED)将进程设置为停止状态。

1、task_struct数据结构

2、进程状态

2.1 就绪态

2.2-TASK_INTERRUPTIBLE

2.3-TASK_UNINTERRUPTIBLE

2.4-EXIT_ZOMBIE&EXIT_DEAD

2.5-__TASK_STOPPED&__TASK_TRACED

3、进程状态的转换

You might also like...

内核链表

kdump调试

内存屏障

系统调用

内核同步机制1

2.4-`EXIT_ZOMBIE`&`EXIT_DEAD`

2.5-`__TASK_STOPPED`&`__TASK_TRACED`