scx_simple

sched_ext

./tools/sched_ext/*.bpf.cにBPFの実装がある。
*.cはユーザー空間のローダーであり、中にはロジック本体を担うやつもありそう

scx_simple.bpf.c

this scheduler should work reasonably well on CPUs with a uniform L3 cache topology
私が使ってるRyzen 9 9955HXでは、CCD(Core Complex Die)が二つあるので、scx_simpleは適してないかも。Ryzen 9 9955HXのlstopo
scx_simpleのハンドラ。必須なのは.nameのみであり、そのほかのハンドラは、定義されていない場合にはscxのデフォルト実装が利用される。

/*
 * Operations table for the "simple" scheduler: each callback slot is wired
 * to the simple_*() handler of the same name, and .name carries the
 * scheduler's name string.
 *
 * NOTE(review): per the accompanying notes only .name is mandatory and any
 * omitted callback falls back to the sched_ext default — confirm against
 * the sched_ext documentation.
 */
SCX_OPS_DEFINE(simple_ops,
	       .select_cpu		= (void *)simple_select_cpu,
	       .enqueue			= (void *)simple_enqueue,
	       .dispatch		= (void *)simple_dispatch,
	       .running			= (void *)simple_running,
	       .stopping		= (void *)simple_stopping,
	       .enable			= (void *)simple_enable,
	       .init			= (void *)simple_init,
	       .exit			= (void *)simple_exit,
	       .name			= "simple");

simple_select_cpu
- scx_bpf_select_cpu_dfl()は、idle CPUが見つからなかった場合にはprev_cpuを返す
- idle CPUがある場合には、そのCPUのlocal DSQに直接enqueueする
- idle CPUがない場合には、scx coreへのヒントとしてcpu(prev_cpu)を返す

/*
 * Choose a CPU for @p using the default selection helper.
 *
 * When the helper reports that it found an idle CPU, the task is inserted
 * directly into the local DSQ with the default slice and the local-queueing
 * counter (stat index 0) is bumped. In every case the CPU returned by the
 * helper is passed back to the caller.
 */
s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	bool found_idle = false;
	s32 target = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &found_idle);

	/* Nothing idle: just hand the helper's pick back. */
	if (!found_idle)
		return target;

	stat_inc(0);	/* count local queueing */
	scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

	return target;
}

simple_enqueue
- fifoの場合: 何もせず独自global DSQにenqueueする
- vtime考慮の場合: vtimeを設定した上で独自global DSQにenqueueする
  - なお、vtime(これまで消費した重みつきCPU時間)が短いほど優先される。
    - vtimeの初期値は、task生成時のvtime_nowであり、これは相対的な値
  - vtimeをkeyとしてpriority queue(rb tree)で管理されている
  - この時、vtimeがvtime_nowと比べてSCX_SLICE_DFL以上に小さい場合には、差がちょうどSCX_SLICE_DFLとなるようにvtimeを引き上げる。つまり、次回以降に必要以上にスケジュールされやすくなるのを防ぐ
  - sleepしてた状態のタスクが貯金できる予算を1 slice分に制限する

/*
 * Queue @p on SHARED_DSQ with the default slice.
 *
 * In FIFO mode the task is inserted as-is. Otherwise it is inserted ordered
 * by its dsq_vtime, after clamping that value so it is never more than one
 * default slice behind vtime_now.
 */
void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
{
	u64 vtime_floor, key;

	stat_inc(1);	/* count global queueing */

	if (fifo_sched) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
		return;
	}

	/*
	 * A task that has been idle may bank at most one slice worth of
	 * budget: anything older than this floor is pulled up to it.
	 */
	vtime_floor = vtime_now - SCX_SLICE_DFL;
	key = p->scx.dsq_vtime;
	if (time_before(key, vtime_floor))
		key = vtime_floor;

	scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, key, enq_flags);
}

simple_dispatch
- dispatchをcallしたCPUのlocal DSQに、独自 global DSQ からタスクを一つ移動させる

/*
 * Dispatch callback: asks the core to move work from SHARED_DSQ to the
 * local DSQ. Neither @cpu nor @prev is used here.
 *
 * NOTE(review): the accompanying notes say this moves a single task to the
 * local DSQ of the CPU that invoked dispatch — confirm against the
 * scx_bpf_dsq_move_to_local() documentation.
 */
void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev)
{
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}

simple_running
- taskがCPUで実行され始めた時に実行される
- taskのvtimeがvtime_now(global variable)より大きい場合には、vtime_nowを更新する
- fifoの場合には何もしない

/*
 * Advance the global vtime_now when @p starts running with a later
 * dsq_vtime. Does nothing in FIFO mode.
 */
void BPF_STRUCT_OPS(simple_running, struct task_struct *p)
{
	/*
	 * vtime_now only ever moves forward. Several CPUs may run this
	 * check-then-store at once, so the update is racy; any resulting
	 * error is expected to be small and short-lived, which is accepted.
	 */
	if (!fifo_sched && time_before(vtime_now, p->scx.dsq_vtime))
		vtime_now = p->scx.dsq_vtime;
}

simple_stopping
- taskがCPUから剥がれた時に実行される
- vtimeを加算する処理を行う。scx.sliceは残りのtime slice
  - デフォルトのyield実装はscx.sliceを0にするため、yieldしたタスクはsliceを全て使い切ったものとして扱われる。そのため、頻繁にyieldするタスクに不利
- fifoの場合には何もしない

/*
 * Charge @p for the part of its slice it used by advancing its dsq_vtime.
 * Does nothing in FIFO mode. @runnable is not consulted.
 */
void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable)
{
	u64 consumed;

	if (fifo_sched)
		return;

	/*
	 * The used portion is the default slice minus what is left in
	 * @p->scx.slice, scaled by 100 / weight so heavier tasks are charged
	 * less.
	 *
	 * The default yield implementation yields by zeroing @p->scx.slice,
	 * so a yielding task is billed as though it ran its whole slice. If
	 * that is too harsh on yielding tasks, measure the execution time with
	 * explicit timestamps rather than relying on @p->scx.slice.
	 */
	consumed = SCX_SLICE_DFL - p->scx.slice;
	p->scx.dsq_vtime += consumed * 100 / p->scx.weight;
}

simple_enable
- タスクがsched_extの管理下にはいる時に実行される
  - 新しいタスクが生成された時

/*
 * Enable callback: seeds @p's dsq_vtime with the current global vtime_now.
 *
 * NOTE(review): per the accompanying notes this runs when a task comes
 * under sched_ext control (e.g. on task creation) — confirm.
 */
void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)
{
	p->scx.dsq_vtime = vtime_now;
}

simple_init
- scx scheduler登録時に実行される
- scx_simple専用の独自DSQを作る

/*
 * Init callback: creates the DSQ identified by SHARED_DSQ and returns the
 * result of scx_bpf_create_dsq() unchanged.
 *
 * NOTE(review): the second argument (-1) presumably means "no specific
 * NUMA node" — confirm against the scx_bpf_create_dsq() documentation.
 */
s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
{
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

simple_exit
- scx scheduler登録解除時に実行される
- UEIはUser Exit Info らしい。
  - scx schedulerが終了した理由をuserspaceに伝えるための仕組み
- userspaceからはueiをBPF Map経由で読める

/*
 * Exit callback: copies the exit information in @ei into the `uei` object
 * via UEI_RECORD().
 *
 * NOTE(review): per the accompanying notes, userspace reads `uei` through a
 * BPF map to learn why the scheduler exited — confirm.
 */
void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

/*
 * UEI_DEFINE(__name) - declare the storage used to report exit info.
 *
 * Expands to three declarations:
 *   - a resizable char array named <__name>_dump (via RESIZABLE_ARRAY),
 *   - a const volatile u32 <__name>_dump_len,
 *   - a struct user_exit_info named <__name>, placed in the ".data" section.
 *
 * The expansion ends without a semicolon, so the caller supplies it.
 */
#define UEI_DEFINE(__name)							\
	char RESIZABLE_ARRAY(data, __name##_dump);				\
	const volatile u32 __name##_dump_len;					\
	struct user_exit_info __name SEC(".data")

/*
 * UEI_RECORD(__uei_name, __ei) - copy exit info from @__ei into @__uei_name.
 *
 * Copies the reason and msg strings into the corresponding fields of
 * @__uei_name (bounded by each field's size) and the dump string into
 * <__uei_name>_dump (bounded by <__uei_name>_dump_len). exit_code is copied
 * only when bpf_core_field_exists() reports that the field is present.
 *
 * kind is written last, using an atomic compare-and-swap to force a memory
 * barrier. NOTE(review): presumably so that a reader observing a new kind
 * also sees the strings written before it — confirm against the userspace
 * side.
 */
#define UEI_RECORD(__uei_name, __ei) ({						\
	bpf_probe_read_kernel_str(__uei_name.reason,				\
				  sizeof(__uei_name.reason), (__ei)->reason);	\
	bpf_probe_read_kernel_str(__uei_name.msg,				\
				  sizeof(__uei_name.msg), (__ei)->msg);		\
	bpf_probe_read_kernel_str(__uei_name##_dump,				\
				  __uei_name##_dump_len, (__ei)->dump);		\
	if (bpf_core_field_exists((__ei)->exit_code))				\
		__uei_name.exit_code = (__ei)->exit_code;			\
	/* use __sync to force memory barrier */				\
	__sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind,		\
				    (__ei)->kind);				\
})

Appendix

RSSハッシュキーによる動作の変化を確認する

動機

igcのRSSハッシュキーの設定が行えるようにするパッチを出したので、実際に動作が変わっているかどうかを確認したい。

前提

RSSが有効なNICがパケットを受信した際、複数存在するキューのうちどのキューにパケットを積むかという部分に注目する。端的に言うと、NICは受信したパケットの要素(IPアドレス、ポートなど)を抽出し、あらかじめ設定されたハッシュキーとハッシュ関数を用いてハッシュ値を計算し、そのハッシュ値をRETA(Redirection table, 別名 Indirection table)のサイズで割った剰余をインデックスとして、RETAから対応するキュー番号を選択する。

詳細についてはDPDKのブログやIntelのPDFに詳しく載っている。

詳しく知りたい方は、RSS Toeplitzなどでググっていただくと、RSSハッシュキーなどを調整してRSSの動作を予測可能にするといった、ある程度高度な情報も見つかるので面白いと思う。

実験

実際にパケットを受信して、以下の事項を確かめる

デフォルトのランダムなハッシュキー環境下で、十分に分散が行われていること
値が全て0のハッシュキーをもつ極端な環境下で、パケットが一つのキューに集中すること

実験では以下の条件を利用する。

デバイス: I226-V
ドライバ: igc
キュー数: 4
ハッシュ関数: Toeplitz
ハッシュキーのサイズ: 40 bytes
ハッシュ計算に用いるフィールド: IP address (src, dst), UDP port (src, dst)

なお、igcでは、デフォルトではハッシュ計算の入力にIP address (src, dst)のみを利用するようになっているため、あらかじめ以下のようなコマンドでUDP port (src, dst)もハッシュ計算に利用するように設定しておく

# ethtool -N enp0s5 rx-flow-hash udp4 sdfn

上記の環境にて、UDP port (src, dst)がランダムな1000個のパケットを受信する。パケット生成側のコードは適当。

import socket
import time
import random


def send_udp(dst_port: int, dst_address):
    """Send one UDP datagram carrying b"Hello" to (dst_address, dst_port).

    A fresh IPv4 datagram socket is opened for each call and closed again
    before returning.

    Args:
        dst_port: Destination UDP port.
        dst_address: Destination host, passed to ``sendto`` unchanged.

    Raises:
        OSError: Propagated from the socket layer if the send fails.
    """
    # Using the socket as a context manager guarantees it is closed even
    # when sendto() raises, so a failed send does not leak a descriptor.
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
        sock.sendto(b"Hello", (dst_address, dst_port))


if __name__ == "__main__":
    # Fire 1000 datagrams at the receiver, each to a random destination
    # port, pausing 1 ms between sends.
    packet_count = 1000
    target_host = "100.100.100.154"

    for _ in range(packet_count):
        port = random.randint(3000, 65000)
        send_udp(port, target_host)
        time.sleep(0.001)

1. デフォルトのランダムなハッシュキー環境

大抵のドライバではデフォルトのハッシュキーはランダムになっており(see netdev_rss_key_fill())、また通常RETAも分散しているため、ハッシュ関数への入力が分散している場合には、選択されるキューもある程度分散されることが期待される。

実践 fault injection framework

fault injection framework の存在自体は認知していたものの、実際に手を動かして試してみるといったことをしないまま日々を過ごしていたところ、ひょんなことからその利便性を体験することができたので、実際の利用方法や活用できそうなシナリオ、Tips (とよべるかどうかわからない情報) を残しておきます。

なお、一次情報はカーネルのソース・ドキュメントに十分存在するため、網羅性については諦めています。

fault injection framework とは

端的には「実行タイミング」における障害注入フレームワークであり、主に debugfs をインターフェイスとしてカーネルの実行時に注入の有無や条件を設定することが可能です。

対照的には、以下の最も単純な例に示すようにビルドタイミングにて障害注入を行うことも可能ですが、この極めてナイーブな例では条件を変える度にビルドが必要となるなど、柔軟性に難点があります。
もちろん、何かしらのインターフェイスで操作可能な custom_should_fail() のような関数を用意して注入することも可能だとは思いますが、そこまでしたいなら素直に fault injection framework に乗っかるのが良いでしょう。

int foo() {
    ...
-   if (bar())
+   if (true || bar())
        return -ENOMEM;
    ...

    return 0;
}

fault injection framework には、失敗させたい操作ごとに異なるトリガーが用意されています。

failslab: slab allocation (kmalloc() などの kmem_cache_alloc() を叩くやつ) を失敗させる
fail_page_alloc: page allocation を失敗させる
fail_usercopy: ユーザー空間とカーネル空間の間のメモリコピーを失敗させる
fail_function: ビルド時に予めマークされている関数の戻り値にエラーを注入する
そのほか多数

なお、基本的な利用方法のほとんどは fail* で共通であるため、今回はたまたま利用することになった failslab のみに焦点を当てることとします。網羅的なガイドはConfigure fault-injection capabilities behaviorを参照ください。

Blog - enjuk

scx_simple

sched_ext

scx_simple.bpf.c

Appendix

RSS Hash key の効果効用

RSSハッシュキーによる動作の変化を確認する

動機

前提

実験

1. デフォルトのランダムなハッシュキー環境

実践 fault injection framework

実践 fault injection framework

fault injection framework とは

More

うわっ・・・わたしのロードアベレージ低すぎ・・・？