コタソートで配列を並び替える

コタソートを使用する

コタソート（KotaSort）は、ボトムアップのマージソートの枠組みで、長さ O(√n) 程度のブロック単位で併合し、併合のあとにブロック選択ソートでブロックの順序を整える。

グレイルソートが併合前にブロック選択を行い、ウィキソートが併合と同時にブロック選択を行うのに対し、コタソートは併合後にブロック選択を行う。ブロックマージのロジックが最も単純で実装しやすく、移動回数と併合比較が最適化されている。多方向ブロックマージへの一般化も可能である。

小さなラン整列: 長さ 16 程度以下の区間は挿入ソートで整える。
ボトムアップ併合: 隣接する整列済み部分列を段階的に倍化しながらマージする。
ブロック併合: 区間が十分大きくなると、長さ √n 程度のブロックに分割し、外部バッファを使って隣接ブロックを順にマージする。
ブロック選択: マージが終わったブロック列に対し、各ブロックの先頭要素をキーとして選択ソートを行い、ブロック全体の順序を決める。安定性のため、別のキーバッファでブロックの元の相対順序を追跡する。
キーの復元: 収集した一意なキー列（約 3√n 個）を挿入ソートし、全体へマージして整列を完了する。

procedure kota_sort(A)
  if length(A) < 16 then
    insertion_sort(A)
    return
  block_len = floor(sqrt(length(A)))
  keys = collect_unique_keys(A, about 3 * block_len)
  build_sorted_runs_bottom_up(A, block_len)
  level = block_len
  while level < length(A)
    for each adjacent pair of sorted runs of length level
      merge_blocks_with_buffer(A, block_len, buffer)
      block_select_sort(A, keys, block_len)   // Kota: selection after merge
    level = level * 2
  insertion_sort(keys)
  merge keys back into A

コタソートはバッファレス変種を持たないため、配列全体の整列を完結させるには別のブロックマージソート（グレイルソートやウィキソートなど）との組み合わせが必要になる。キー数は他のブロックマージ変種と比べて最も多く、約 3√n 個を要する。

最悪計算量 O(n log n) の安定ソートで、ブロックマージ系の中では実装が単純である。

類似アルゴリズムとの相違点

グレイルソートは併合前に、ウィキソートは併合と同時にブロック選択を行う。コタソートは併合後にブロック選択を行い、3 者のうちロジックが最も単純である。

時間計算量および空間計算量を計測する

Size	Average time	Maximum time	Average memory	Maximum memory
256	0.000006	0.000515	1666	1672
512	0.000012	0.000707	1669	1676
1024	0.000027	0.000476	1677	1684
2048	0.000062	0.000437	1698	1704
4096	0.000135	0.000481	1737	1744
8192	0.000296	0.000599	1821	1828
16384	0.000640	0.001047	1986	1992
32768	0.001386	0.003197	2306	2312
65536	0.003019	0.003812	2820	2824
131072	0.006439	0.012326	4096	4096
262144	0.014072	0.023600	6659	6768

計測に使用したコードを表示する

# Abort on any command failure, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -euo pipefail

# Scratch directory that will hold the generated Dockerfile (the Docker build
# context); the EXIT trap removes it when the script terminates.
WORKDIR="$(mktemp -d)"
trap 'rm -rf "$WORKDIR"' EXIT

cat > "$WORKDIR/Dockerfile" <<'EOF'
FROM rust:1.95.0

WORKDIR /app

RUN mkdir -p src

RUN cat > Cargo.toml <<'CARGO'
[package]
name = "rust-benchmark"
version = "0.1.0"
edition = "2021"

[profile.release]
lto = true
codegen-units = 1
panic = "abort"
CARGO

RUN cat > src/main.rs <<'RUST'
use std::{
    env,
    process::Command,
    time::{Duration, Instant},
};
// Benchmarked input sizes are the powers of two 2^MIN_POWER ..= 2^MAX_POWER
// (main computes `1usize << power` for each power in that inclusive range).
const MIN_POWER: u32 = 8;
const MAX_POWER: u32 = 18;
// Number of measured runs per size; main spawns one child process per seed in 1..=RUNS.
const RUNS: usize = 8192;


/// Sorts the half-open range `a[lo..hi]` in place into non-decreasing order
/// using insertion sort. Elements outside the range are not touched.
///
/// Equal elements are never moved past each other (the shift only happens for
/// strictly greater neighbours). Indexing panics if `hi` exceeds `a.len()`
/// while the range is non-empty.
fn kota_insertion_sort(a: &mut [usize], lo: usize, hi: usize) {
    let mut next = lo + 1;
    while next < hi {
        let pending = a[next];

        // Walk left from `next`, shifting every strictly larger element one
        // slot to the right until the insertion point is found.
        let mut slot = next;
        loop {
            if slot == lo {
                break;
            }
            let left = a[slot - 1];
            if left <= pending {
                break;
            }
            a[slot] = left;
            slot -= 1;
        }

        a[slot] = pending;
        next += 1;
    }
}

/// Exchanges block `i` with block `j`, element by element.
///
/// Block `b` occupies `a[start + b * block_len .. start + (b + 1) * block_len]`.
/// When `i == j` every element is swapped with itself, leaving `a` unchanged.
fn kota_block_swap(a: &mut [usize], start: usize, block_len: usize, i: usize, j: usize) {
    let first = start + i * block_len;
    let second = start + j * block_len;
    (0..block_len).for_each(|offset| a.swap(first + offset, second + offset));
}

/// Selection-sorts `block_count` consecutive blocks of `block_len` elements,
/// starting at index `start`, using each block's first element as its key.
///
/// For every position the block with the smallest head among the remaining
/// blocks is chosen (the earliest one on ties, because the comparison is
/// strict) and swapped into place with `kota_block_swap`.
fn kota_block_select(a: &mut [usize], start: usize, block_count: usize, block_len: usize) {
    // Index of the first element (the key) of a given block.
    let head = |block: usize| start + block * block_len;

    let mut settled = 0;
    while settled < block_count {
        let mut smallest = settled;
        for candidate in settled + 1..block_count {
            if a[head(candidate)] < a[head(smallest)] {
                smallest = candidate;
            }
        }

        if smallest != settled {
            kota_block_swap(a, start, block_len, settled, smallest);
        }
        settled += 1;
    }
}

/// Merges the adjacent runs `a[lo..mid]` and `a[mid..hi]` into `a[lo..hi]`.
///
/// Both runs are expected to be sorted already (not checked). The left run is
/// copied into `buf`, which is resized to exactly `mid - lo` elements, and the
/// merge then writes forward into `a`. On ties the left element is emitted
/// first. Right-run elements left over once the left run is exhausted are
/// already in their final positions and are not touched.
fn kota_merge_with_buffer(a: &mut [usize], lo: usize, mid: usize, hi: usize, buf: &mut Vec<usize>) {
    buf.clear();
    buf.extend_from_slice(&a[lo..mid]);

    let mut pending = buf.iter().copied().peekable();
    let mut right = mid;
    let mut out = lo;

    while right < hi {
        let left_value = match pending.peek() {
            Some(&value) => value,
            None => break,
        };
        let right_value = a[right];
        if left_value <= right_value {
            a[out] = left_value;
            pending.next();
        } else {
            a[out] = right_value;
            right += 1;
        }
        out += 1;
    }

    // Right run exhausted first: flush whatever is left of the buffered run.
    for left_value in pending {
        a[out] = left_value;
        out += 1;
    }
}

/// Sorts `a` in place into non-decreasing order.
///
/// Steps, as implemented here:
/// 1. Slices shorter than 16 elements are insertion-sorted directly.
/// 2. Otherwise every 16-element run (the last may be shorter) is
///    insertion-sorted.
/// 3. Runs are merged bottom-up, doubling the run width each pass, with
///    `kota_merge_with_buffer` and a single reusable scratch vector.
/// 4. After each merge whose span covers at least two blocks of
///    `floor(sqrt(n))` elements, `kota_block_select` is run over the whole
///    blocks of that span. The span has just been fully merged, so block
///    heads are already in non-decreasing order when the selection runs.
fn kota_sort(a: &mut [usize]) {
    const RUN_SIZE: usize = 16;

    let n = a.len();
    if n <= 1 {
        return;
    }

    let block_len = ((n as f64).sqrt() as usize).max(1);

    if n < RUN_SIZE {
        kota_insertion_sort(a, 0, n);
        return;
    }

    // Phase 1: sort each fixed-size run independently.
    for run in a.chunks_mut(RUN_SIZE) {
        let run_len = run.len();
        kota_insertion_sort(run, 0, run_len);
    }

    // Phase 2: bottom-up merging of neighbouring runs.
    let mut scratch = Vec::new();
    let mut width = RUN_SIZE;
    while width < n {
        let pair = width * 2;
        for lo in (0..n).step_by(pair) {
            let mid = n.min(lo + width);
            let hi = n.min(lo + pair);

            // A lone trailing run has no right-hand partner to merge with.
            if hi > mid {
                kota_merge_with_buffer(a, lo, mid, hi, &mut scratch);

                let span = hi - lo;
                if span >= block_len * 2 {
                    kota_block_select(a, lo, span / block_len, block_len);
                }
            }
        }
        width = pair;
    }
}


/// The sort under test: a thin wrapper so the correctness checks and the
/// timed run both go through the same entry point.
fn benchmark_sort(array: &mut [usize]) {
    kota_sort(array);
}

/// Returns `true` when no element is smaller than its predecessor.
/// Empty and single-element slices are trivially ordered.
fn is_non_decreasing(a: &[usize]) -> bool {
    a.iter()
        .zip(a.iter().skip(1))
        .all(|(previous, current)| previous <= current)
}

/// Returns `true` when `a` and `b` contain exactly the same values with the
/// same multiplicities, regardless of order.
fn same_multiset(a: &[usize], b: &[usize]) -> bool {
    // Sorted copies compare equal exactly when the multisets match.
    let sorted_copy = |values: &[usize]| {
        let mut copy = values.to_vec();
        copy.sort_unstable();
        copy
    };

    a.len() == b.len() && sorted_copy(a) == sorted_copy(b)
}

/// Runs the sort under test on `input` and panics, naming `label`, if the
/// result is not in non-decreasing order or is not a permutation of the
/// original input. Ordering is checked before the element check.
fn check_correctness_case(label: &str, mut input: Vec<usize>) {
    let before = input.clone();
    benchmark_sort(&mut input);

    assert!(
        is_non_decreasing(&input),
        "correctness case {}: output is not sorted",
        label
    );
    assert!(
        same_multiset(&input, &before),
        "correctness case {}: elements were lost or added",
        label
    );
}

/// Generates `size` pseudo-random values in `1..=unique` from a 64-bit
/// xorshift generator (shifts 13, 7, 17) seeded with `seed`.
///
/// The all-zero state is a fixed point of xorshift: every shift/xor step maps
/// 0 to 0, so a zero seed would make every output equal to 1 instead of
/// producing a few-unique-values input. A zero seed is therefore replaced by a
/// fixed non-zero constant. Non-zero seeds produce the same sequence as
/// before.
///
/// # Panics
/// Panics on the remainder operation if `unique == 0` and `size > 0`.
fn few_unique_values(size: usize, unique: usize, seed: u64) -> Vec<usize> {
    // Arbitrary non-zero stand-in for the degenerate zero seed.
    const ZERO_SEED_REPLACEMENT: u64 = 0x9E37_79B9_7F4A_7C15;

    let mut state = if seed == 0 { ZERO_SEED_REPLACEMENT } else { seed };

    (0..size)
        .map(|_| {
            state ^= state << 13;
            state ^= state >> 7;
            state ^= state << 17;
            (state as usize % unique) + 1
        })
        .collect()
}

/// Runs the sort under test against a fixed set of small inputs and a batch of
/// generated low-cardinality inputs; any failure panics inside
/// `check_correctness_case`.
fn run_correctness_checks() {
    let fixed_cases: Vec<(&str, Vec<usize>)> = vec![
        ("empty", vec![]),
        ("single", vec![42]),
        ("duplicates", vec![3, 1, 3, 2, 1, 2]),
        ("sorted", vec![1, 2, 3, 4, 5]),
        ("reverse", vec![5, 4, 3, 2, 1]),
        ("all_equal", vec![7, 7, 7, 7]),
        ("skewed_range", vec![1_000_000, 2, 1_000_001, 1, 999_999]),
        // Many duplicates at exactly 16 elements: the smallest length that
        // takes the run-building path of kota_sort instead of the plain
        // insertion-sort shortcut.
        (
            "few_keys_len16",
            vec![2, 2, 2, 2, 2, 2, 2, 2, 4, 3, 1, 2, 3, 4, 1, 4],
        ),
    ];
    for (label, input) in fixed_cases {
        check_correctness_case(label, input);
    }

    // 32 elements drawn from 4 distinct values: two 16-element runs, so one
    // merge with many ties is performed for each seed.
    for seed in 0..32 {
        let label = format!("few_keys_len32_seed_{seed}");
        check_correctness_case(&label, few_unique_values(32, 4, seed));
    }
}


/// Returns the values `1..=size` permuted by a Fisher-Yates style pass driven
/// by a 64-bit xorshift generator (shifts 13, 7, 17) seeded with `seed`.
///
/// The pass walks from the last index down to index 1 and swaps each position
/// with a generator-chosen index at or below it. Note that a zero seed keeps
/// the generator state at zero, so every swap partner is index 0 and the
/// result is a fixed rotation rather than a pseudo-random permutation.
fn shuffled(size: usize, seed: u64) -> Vec<usize> {
    let mut values: Vec<usize> = (1..=size).collect();
    let mut rng = seed;

    let mut last = size;
    while last > 1 {
        last -= 1;

        rng ^= rng << 13;
        rng ^= rng >> 7;
        rng ^= rng << 17;

        let pick = (rng as usize) % (last + 1);
        values.swap(last, pick);
    }

    values
}

/// Returns the numeric value of the `VmHWM:` line of `/proc/self/status`
/// (the process's peak resident set size, reported by the kernel in kB).
///
/// Returns 0 when the file cannot be read, the line is absent, or its value
/// does not parse. The caller subtracts a per-size baseline that only holds
/// the input array, so the reported table reflects auxiliary space used
/// during sorting.
fn memory_usage_kb() -> usize {
    let status = std::fs::read_to_string("/proc/self/status").unwrap_or_default();

    status
        .lines()
        .find_map(|line| line.strip_prefix("VmHWM:"))
        .map(|rest| {
            rest.split_whitespace()
                .next()
                .unwrap_or("0")
                .parse::<usize>()
                .unwrap_or(0)
        })
        .unwrap_or(0)
}

/// Converts a duration to whole microseconds (fractional part truncated).
fn micros(d: Duration) -> u128 {
    Duration::as_micros(&d)
}

/// Builds the benchmark input for a given size and seed: a seeded shuffle of
/// the values `1..=size`.
fn input_array(size: usize, seed: u64) -> Vec<usize> {
    let permutation = shuffled(size, seed);
    permutation
}

/// Measures the memory reading of a process that only builds the input array
/// (seed 1) and does not sort it. The array is kept alive until after the
/// reading is taken.
fn run_baseline(size: usize) -> usize {
    let resident_input = input_array(size, 1);
    let peak_kb = memory_usage_kb();
    drop(resident_input);
    peak_kb
}

/// Performs one timed sort of a freshly generated input.
///
/// Returns `(elapsed microseconds, memory reading)`. Only the call to
/// `benchmark_sort` is timed; the memory reading is taken right after the
/// sort and before the result is verified.
///
/// # Panics
/// Panics if the sorted array is not exactly `1, 2, ..., size`.
fn run_once(size: usize, seed: usize) -> (u128, usize) {
    let mut array = input_array(size, seed as u64);

    let timer = Instant::now();
    benchmark_sort(&mut array);
    let elapsed = timer.elapsed();

    let peak_kb = memory_usage_kb();

    // The input is a permutation of 1..=size, so the sorted result is known.
    let sorted_ok = array.iter().copied().eq(1..=size);
    assert!(
        sorted_ok,
        "sort failed with seed {} for size {}",
        seed,
        size
    );

    (micros(elapsed), peak_kb)
}

/// Child-process mode for `--baseline-once <size>`: prints the baseline memory
/// reading for `size` on one line of stdout.
///
/// # Panics
/// Panics if `args[2]` is missing or is not a valid `usize`.
fn run_baseline_child(args: &[String]) {
    let size: usize = args[2].parse().expect("invalid size");
    println!("{}", run_baseline(size));
}

/// Child-process mode for `--run-once <size> <seed>`: performs one timed sort
/// and prints `<elapsed microseconds> <memory reading>` on one line of stdout.
///
/// # Panics
/// Panics if `args[2]` or `args[3]` is missing or is not a valid `usize`.
fn run_child(args: &[String]) {
    let size: usize = args[2].parse().expect("invalid size");
    let seed: usize = args[3].parse().expect("invalid seed");

    let (elapsed_us, mem) = run_once(size, seed);
    println!("{} {}", elapsed_us, mem);
}

/// Program entry point.
///
/// * `--baseline-once <size>` and `--run-once <size> <seed>` are the child
///   modes; each performs a single measurement, prints it and exits.
/// * With any other arguments the process acts as the driver: it runs the
///   correctness checks, then for every size `2^MIN_POWER ..= 2^MAX_POWER`
///   re-executes itself once for a baseline memory reading and `RUNS` times
///   (seeds `1..=RUNS`) for timed sorts, and prints one table row per size.
///
/// Times are collected in microseconds and printed as seconds with six
/// decimals; memory is each child's reading minus the baseline (clamped at 0).
fn main() {
    let args: Vec<String> = env::args().collect();

    match args.get(1).map(String::as_str) {
        Some("--baseline-once") => {
            run_baseline_child(&args);
            return;
        }
        Some("--run-once") => {
            run_child(&args);
            return;
        }
        _ => {}
    }

    run_correctness_checks();

    println!(
        "| {:>10} | {:>15} | {:>15} | {:>15} | {:>15} |",
        "Size", "Average time", "Maximum time", "Average memory", "Maximum memory"
    );
    println!(
        "|{:-<11}:|{:-<16}:|{:-<16}:|{:-<16}:|{:-<16}:|",
        "", "", "", "", ""
    );

    for power in MIN_POWER..=MAX_POWER {
        let size = 1usize << power;

        // Baseline: a child that only allocates the input array.
        let baseline_run = Command::new(env::current_exe().expect("failed to find current executable"))
            .arg("--baseline-once")
            .arg(size.to_string())
            .output()
            .expect("failed to run benchmark baseline process");

        if !baseline_run.status.success() {
            panic!(
                "benchmark baseline process failed: {}",
                String::from_utf8_lossy(&baseline_run.stderr)
            );
        }

        let baseline_text = String::from_utf8(baseline_run.stdout)
            .expect("baseline process returned non-UTF-8 output");
        let baseline_mem: usize = baseline_text
            .split_whitespace()
            .next()
            .expect("missing baseline memory usage")
            .parse()
            .expect("invalid baseline memory usage");

        let mut time_sum: u128 = 0;
        let mut time_peak: u128 = 0;
        let mut mem_sum: usize = 0;
        let mut mem_peak: usize = 0;

        // One fresh child process per seed, so each run starts from a clean
        // peak-memory counter.
        for seed in 1..=RUNS {
            let child_run = Command::new(env::current_exe().expect("failed to find current executable"))
                .arg("--run-once")
                .arg(size.to_string())
                .arg(seed.to_string())
                .output()
                .expect("failed to run benchmark child process");

            if !child_run.status.success() {
                panic!(
                    "benchmark child process failed: {}",
                    String::from_utf8_lossy(&child_run.stderr)
                );
            }

            let report = String::from_utf8(child_run.stdout)
                .expect("child process returned non-UTF-8 output");
            let mut fields = report.split_whitespace();
            let elapsed_us: u128 = fields
                .next()
                .expect("missing elapsed time")
                .parse()
                .expect("invalid elapsed time");
            let mem: usize = fields
                .next()
                .expect("missing memory usage")
                .parse()
                .expect("invalid memory usage");

            time_sum += elapsed_us;
            time_peak = time_peak.max(elapsed_us);

            let aux_mem = mem.saturating_sub(baseline_mem);
            mem_sum += aux_mem;
            mem_peak = mem_peak.max(aux_mem);
        }

        let avg_time = time_sum / RUNS as u128;
        let avg_mem = mem_sum / RUNS;

        println!(
            "| {:>10} | {:>15} | {:>15} | {:>15} | {:>15} |",
            size,
            format!("{}.{:06}", avg_time / 1_000_000, avg_time % 1_000_000),
            format!("{}.{:06}", time_peak / 1_000_000, time_peak % 1_000_000),
            avg_mem,
            mem_peak
        );
    }
}
RUST

RUN cargo build --release

CMD ["./target/release/rust-benchmark"]
EOF

# Build the benchmark image from the generated Dockerfile in $WORKDIR, then run
# it once. --rm deletes the container afterwards; --init runs it under Docker's
# init process.
docker build -t rust-benchmark "$WORKDIR"
docker run --rm --init rust-benchmark