//! Provides CRC-32/ISCSI calculations using a fusion of native CLMUL
//! instructions and native CRC calculation instructions on x86 / x86_64.
//!
//! <https://www.corsix.org/content/fast-crc32c-4k>
//! <https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq>
//!
//! Converted to Rust from the original C code generated by <https://github.com/corsix/fast-crc32/>
//! with the help of Claude.ai.
//!
//! Modified as necessary for this Rust implementation.
//!
//! MIT licensed.

#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]

mod iscsi;

use iscsi::sse_pclmulqdq::crc32_iscsi_sse_v4s3x3;

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
#[cfg(target_arch = "x86_64")]
#[rustversion::since(1.89)]
use iscsi::avx512_pclmulqdq::crc32_iscsi_avx512_v4s3x3;
#[cfg(target_arch = "x86_64")]
#[rustversion::since(1.89)]
use iscsi::avx512_vpclmulqdq::crc32_iscsi_avx512_vpclmulqdq_v3x2;

/// Computes CRC-32/ISCSI over `data` on Rust toolchains older than 1.89, where the
/// AVX-512 kernels are not compiled in.
///
/// Buffers of at most 256 bytes are handled by [`crc32_iscsi_small_fast`] when SSE4.2 is
/// detected at runtime; every other input is handed to the SSE/PCLMULQDQ kernel.
///
/// `crc` is the running CRC state and is passed through unchanged to the selected kernel;
/// the updated state is returned.
///
/// This function is meant to be called by the wrapper layer after it has performed feature
/// detection: the SSE kernel is invoked without any further check here.
#[rustversion::before(1.89)]
#[inline(always)]
pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 {
    // The native CRC32 instructions used by the small-buffer path need SSE4.2.
    let take_small_path = data.len() <= 256 && is_x86_feature_detected!("sse4.2");

    if take_small_path {
        // SAFETY: SSE4.2 support was detected on the line above.
        unsafe { crc32_iscsi_small_fast(crc, data) }
    } else {
        // SAFETY: the pointer/length pair comes from a live slice; CPU support for the SSE
        // kernel is the wrapper layer's responsibility (it is the only kernel available
        // before Rust 1.89).
        unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) }
    }
}

/// Computes CRC-32/ISCSI over `data` on Rust 1.89 and newer, choosing the widest kernel
/// that is available.
///
/// Selection order:
/// 1. buffers of at most 256 bytes use [`crc32_iscsi_small_fast`] when SSE4.2 is detected;
/// 2. on x86_64, when `avx512vl` is detected, the VPCLMULQDQ kernel is used if
///    `vpclmulqdq` is also detected, otherwise the plain AVX-512 kernel;
/// 3. everything else falls back to the SSE/PCLMULQDQ kernel.
///
/// `crc` is the running CRC state and is passed through unchanged to the selected kernel;
/// the updated state is returned.
///
/// This function is meant to be called by the wrapper layer after it has performed feature
/// detection: the SSE fallback is invoked without any further check here.
#[rustversion::since(1.89)]
#[inline(always)]
pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 {
    let len = data.len();
    let bytes = data.as_ptr();

    // The native CRC32 instructions used by the small-buffer path need SSE4.2.
    let take_small_path = len <= 256 && is_x86_feature_detected!("sse4.2");
    if take_small_path {
        // SAFETY: SSE4.2 support was detected on the line above.
        return unsafe { crc32_iscsi_small_fast(crc, data) };
    }

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx512vl") {
            return if is_x86_feature_detected!("vpclmulqdq") {
                // AVX512 + VPCLMULQDQ
                // SAFETY: both features were detected; `bytes`/`len` describe a live slice.
                unsafe { crc32_iscsi_avx512_vpclmulqdq_v3x2(crc, bytes, len) }
            } else {
                // AVX512
                // SAFETY: `avx512vl` was detected; `bytes`/`len` describe a live slice.
                unsafe { crc32_iscsi_avx512_v4s3x3(crc, bytes, len) }
            };
        }
    }

    // Fallback to SSE implementation
    // SAFETY: `bytes`/`len` describe a live slice; CPU support for the SSE kernel is the
    // wrapper layer's responsibility.
    unsafe { crc32_iscsi_sse_v4s3x3(crc, bytes, len) }
}

/// Carry-less multiply with selector immediate `0x00` (the low 64-bit halves of `a` and
/// `b`), applied by the 512-bit `_mm512_clmulepi64_epi128` intrinsic.
///
/// # Safety
///
/// The CPU must support the `avx512vl` and `vpclmulqdq` target features.
#[rustversion::since(1.89)]
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512vl,vpclmulqdq")]
unsafe fn clmul_lo_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i {
    _mm512_clmulepi64_epi128::<0x00>(a, b)
}

/// Carry-less multiply with selector immediate `0x11` (the high 64-bit halves of `a` and
/// `b`), applied by the 512-bit `_mm512_clmulepi64_epi128` intrinsic.
///
/// # Safety
///
/// The CPU must support the `avx512vl` and `vpclmulqdq` target features.
#[rustversion::since(1.89)]
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512vl,vpclmulqdq")]
unsafe fn clmul_hi_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i {
    _mm512_clmulepi64_epi128::<0x11>(a, b)
}

/// Carry-less multiply of the low 64-bit halves of `a` and `b` (selector immediate `0x00`).
///
/// # Safety
///
/// The CPU must support the `pclmulqdq` target feature.
#[inline]
#[target_feature(enable = "pclmulqdq")]
unsafe fn clmul_lo_sse(a: __m128i, b: __m128i) -> __m128i {
    _mm_clmulepi64_si128::<0x00>(a, b)
}

/// Carry-less multiply of the high 64-bit halves of `a` and `b` (selector immediate `0x11`).
///
/// # Safety
///
/// The CPU must support the `pclmulqdq` target feature.
#[inline]
#[target_feature(enable = "pclmulqdq")]
unsafe fn clmul_hi_sse(a: __m128i, b: __m128i) -> __m128i {
    _mm_clmulepi64_si128::<0x11>(a, b)
}

/// Carry-less multiply of two 32-bit scalars.
///
/// Each scalar is moved into the low 32 bits of its own vector register and the low
/// 64-bit lanes of the two registers are then multiplied.
///
/// # Safety
///
/// The CPU must support the `pclmulqdq` target feature.
#[inline]
#[target_feature(enable = "pclmulqdq")]
unsafe fn clmul_scalar_sse(a: u32, b: u32) -> __m128i {
    let lhs = _mm_cvtsi32_si128(a as i32);
    let rhs = _mm_cvtsi32_si128(b as i32);
    _mm_clmulepi64_si128::<0x00>(lhs, rhs)
}

/// x^n mod P, in log(n) time.
///
/// The exponent is first reduced: while `n > 191` its parity bit is pushed onto a bit
/// stack and `n` becomes `(n >> 1) - 16`. The remaining small exponent is turned into an
/// initial accumulator with a single-bit constant plus `n / 32` hardware CRC32C steps over
/// a zero word. The recorded bits are then replayed, each step squaring the accumulator
/// with a carry-less multiply and reducing it with the CRC32C instruction.
///
/// # Safety
///
/// The CPU must support the `sse4.2` and `pclmulqdq` target features.
#[target_feature(enable = "sse4.2,pclmulqdq")]
unsafe fn xnmodp_iscsi_sse(mut n: u64) -> u32 {
    // Bit stack; the initial pattern leaves a marker that becomes a sentinel 1 bit above
    // the recorded bits once the stack is complemented below.
    let mut pending = !1u64;
    while n > 191 {
        // The shifted value always has a clear low bit, so OR-ing the parity in is the
        // same as adding it.
        pending = (pending << 1) | (n & 1);
        n = (n >> 1) - 16;
    }
    pending = !pending;

    let mut acc = 0x80000000u32 >> (n & 31);
    for _ in 0..(n >> 5) {
        // Use hardware CRC32C instruction
        acc = _mm_crc32_u32(acc, 0);
    }

    loop {
        // Pop the next recorded bit; once the remaining stack is empty the bit just popped
        // was the sentinel and the replay is finished.
        let low = (pending & 1) as u32;
        pending >>= 1;
        if pending == 0 {
            break;
        }

        let x = _mm_cvtsi32_si128(acc as i32);
        let squared = mm_extract_epi64(_mm_clmulepi64_si128::<0x00>(x, x), 0);
        acc = mm_crc32_u64(0, squared << low);
    }

    acc
}

/// Carry-less multiplies `crc` by `xnmodp_iscsi_sse(nbytes * 8 - 33)` and returns the
/// unreduced product.
///
/// `nbytes * 8` must be at least 33 (i.e. `nbytes >= 5`); smaller values underflow the
/// exponent computation (a panic in builds with overflow checks).
///
/// # Safety
///
/// The CPU must support the `sse4.2` and `pclmulqdq` target features. Both are declared
/// here because this function unconditionally calls `xnmodp_iscsi_sse`, which executes
/// SSE4.2 CRC32 instructions in addition to PCLMULQDQ.
#[inline]
#[target_feature(enable = "sse4.2,pclmulqdq")]
unsafe fn crc_shift_iscsi_sse(crc: u32, nbytes: usize) -> __m128i {
    let multiplier = xnmodp_iscsi_sse((nbytes * 8 - 33) as u64);
    clmul_scalar_sse(crc, multiplier)
}

/// Extracts one 64-bit lane from `val` as a `u64`.
///
/// `idx == 0` selects the low lane; any other value selects the high lane. The 32-bit
/// x86 build has no 64-bit register move, so it assembles the result from two 32-bit
/// extractions.
///
/// # Safety
///
/// The CPU must support the `sse4.1` target feature.
#[inline]
#[target_feature(enable = "sse4.1")]
unsafe fn mm_extract_epi64(val: __m128i, idx: i32) -> u64 {
    // Bring the requested lane down to the bottom 64 bits of the register.
    let lane = if idx == 0 {
        val
    } else {
        _mm_srli_si128::<8>(val)
    };

    #[cfg(target_arch = "x86_64")]
    {
        _mm_cvtsi128_si64(lane) as u64
    }
    #[cfg(target_arch = "x86")]
    {
        // On 32-bit x86, extract two 32-bit values and combine them
        let lower = _mm_cvtsi128_si32(lane) as u32;
        let upper = _mm_cvtsi128_si32(_mm_srli_si128::<4>(lane)) as u32;
        (u64::from(upper) << 32) | u64::from(lower)
    }
}

/// Feeds the 64-bit value `val` into the hardware CRC32C state `crc` and returns the new
/// 32-bit state.
///
/// On x86_64 this is a single 64-bit CRC32 instruction; on 32-bit x86 the value is split
/// and its low half is processed before its high half.
///
/// # Safety
///
/// The CPU must support the `sse4.2` target feature.
#[inline]
#[target_feature(enable = "sse4.2")]
unsafe fn mm_crc32_u64(crc: u32, val: u64) -> u32 {
    #[cfg(target_arch = "x86_64")]
    {
        let state = u64::from(crc);
        _mm_crc32_u64(state, val) as u32
    }
    #[cfg(target_arch = "x86")]
    {
        // On 32-bit x86, process 64-bit value as two 32-bit CRC operations
        let after_low = _mm_crc32_u32(crc, val as u32);
        _mm_crc32_u32(after_low, (val >> 32) as u32)
    }
}

/// CRC-32/ISCSI calculation for small buffers (<= 256 bytes) using unrolled native CRC
/// instructions.
///
/// The input is split into an unaligned byte prefix, a run of 8-byte-aligned `u64` words
/// and a byte suffix. Bytes go through the 8-bit CRC32 instruction one at a time; words go
/// through [`mm_crc32_u64`], eight per loop iteration while at least eight remain.
///
/// `crc` is the incoming CRC state; the updated state is returned.
///
/// # Safety
///
/// The CPU must support the `sse4.2` target feature.
#[inline]
#[target_feature(enable = "sse4.2")]
pub unsafe fn crc32_iscsi_small_fast(mut crc: u32, data: &[u8]) -> u32 {
    let (head, words, tail) = data.align_to::<u64>();

    for &b in head {
        crc = _mm_crc32_u8(crc, b);
    }

    // Consume the aligned words in groups of eight, manually unrolled.
    let mut rest = words;
    while rest.len() >= 8 {
        let (group, remaining) = rest.split_at(8);
        crc = mm_crc32_u64(crc, group[0]);
        crc = mm_crc32_u64(crc, group[1]);
        crc = mm_crc32_u64(crc, group[2]);
        crc = mm_crc32_u64(crc, group[3]);
        crc = mm_crc32_u64(crc, group[4]);
        crc = mm_crc32_u64(crc, group[5]);
        crc = mm_crc32_u64(crc, group[6]);
        crc = mm_crc32_u64(crc, group[7]);
        rest = remaining;
    }

    // Fewer than eight words are left over.
    for &word in rest {
        crc = mm_crc32_u64(crc, word);
    }

    for &b in tail {
        crc = _mm_crc32_u8(crc, b);
    }

    crc
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::test::consts::TEST_CHECK_STRING;
    use crc::{Crc, Table};
    use rand::{rng, Rng};

    /// Table-driven reference implementation used to validate the accelerated kernels.
    const RUST_CRC32_ISCSI: Crc<u32, Table<16>> = Crc::<u32, Table<16>>::new(&crc::CRC_32_ISCSI);

    #[test]
    fn test_crc32_iscsi_check() {
        if is_x86_feature_detected!("sse4.2") && is_x86_feature_detected!("pclmulqdq") {
            assert_eq!(
                crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff,
                0xe3069283
            );
        }
    }

    #[test]
    fn test_crc32_iscsi_small_fast_check() {
        if is_x86_feature_detected!("sse4.2") {
            // SAFETY: SSE4.2 support was detected above.
            unsafe {
                assert_eq!(
                    crc32_iscsi_small_fast(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff,
                    0xe3069283
                );
            }
        }
    }

    #[test]
    fn test_crc32_iscsi_small_fast_all_lengths() {
        if is_x86_feature_detected!("sse4.2") {
            // Cover the whole small-buffer range, including the empty input and the
            // 256-byte upper bound used by the dispatcher.
            for len in 0..=256 {
                test_crc32_iscsi_small_fast_random(len);
            }
        }
    }

    /// Checks `crc32_iscsi_small_fast` against the reference for `len` random bytes.
    fn test_crc32_iscsi_small_fast_random(len: usize) {
        let mut data = vec![0u8; len];
        rng().fill(&mut data[..]);

        let checksum = RUST_CRC32_ISCSI.checksum(&data);

        if is_x86_feature_detected!("sse4.2") {
            // SAFETY: SSE4.2 support was detected above.
            unsafe {
                assert_eq!(
                    crc32_iscsi_small_fast(0xffffffff, &data) ^ 0xffffffff,
                    checksum
                );
            }
        }
    }

    #[test]
    fn test_crc32_iscsi_small_all_lengths() {
        if is_x86_feature_detected!("sse4.2") && is_x86_feature_detected!("pclmulqdq") {
            for len in 1..=255 {
                test_crc32_iscsi_random(len);
            }
        }
    }

    #[test]
    fn test_crc32_iscsi_medium_lengths() {
        if is_x86_feature_detected!("sse4.2") && is_x86_feature_detected!("pclmulqdq") {
            // Test each length from 256 to 1024, which should fold and include handling remainders
            for len in 256..=1024 {
                test_crc32_iscsi_random(len);
            }
        }
    }

    #[test]
    fn test_crc32_iscsi_large_lengths() {
        if is_x86_feature_detected!("sse4.2") && is_x86_feature_detected!("pclmulqdq") {
            // Test 1 MiB just before, at, and just after the folding boundaries.
            // The range is inclusive so that the "just after" length (1 MiB + 1) is covered.
            for len in 1048575..=1048577 {
                test_crc32_iscsi_random(len);
            }
        }
    }

    /// Checks the dispatcher and every kernel usable on this CPU against the reference
    /// for `len` random bytes (Rust 1.89+, AVX-512 kernels available).
    #[rustversion::since(1.89)]
    fn test_crc32_iscsi_random(len: usize) {
        let mut data = vec![0u8; len];
        rng().fill(&mut data[..]);

        let checksum = RUST_CRC32_ISCSI.checksum(&data);

        if is_x86_feature_detected!("sse4.2") && is_x86_feature_detected!("pclmulqdq") {
            assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum);

            // SAFETY: SSE4.2 and PCLMULQDQ were detected above; the pointer/length pair
            // describes the live `data` buffer.
            unsafe {
                assert_eq!(
                    crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff,
                    checksum
                );
            }
        }

        #[cfg(target_arch = "x86_64")]
        {
            if is_x86_feature_detected!("vpclmulqdq")
                && is_x86_feature_detected!("avx512vl")
                && is_x86_feature_detected!("avx512f")
            {
                // SAFETY: the required features were detected on the lines above.
                unsafe {
                    assert_eq!(
                        crc32_iscsi_avx512_vpclmulqdq_v3x2(0xffffffff, data.as_ptr(), data.len())
                            ^ 0xffffffff,
                        checksum
                    );
                }
            }

            if is_x86_feature_detected!("avx512vl")
                && is_x86_feature_detected!("avx512f")
                && is_x86_feature_detected!("pclmulqdq")
            {
                // SAFETY: the required features were detected on the lines above.
                unsafe {
                    assert_eq!(
                        crc32_iscsi_avx512_v4s3x3(0xffffffff, data.as_ptr(), data.len())
                            ^ 0xffffffff,
                        checksum
                    );
                }
            }
        }
    }

    /// Checks the dispatcher and the SSE kernel against the reference for `len` random
    /// bytes (Rust < 1.89, SSE kernel only).
    #[rustversion::before(1.89)]
    fn test_crc32_iscsi_random(len: usize) {
        let mut data = vec![0u8; len];
        rng().fill(&mut data[..]);

        let checksum = RUST_CRC32_ISCSI.checksum(&data);

        if is_x86_feature_detected!("sse4.2") && is_x86_feature_detected!("pclmulqdq") {
            assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum);

            // SAFETY: SSE4.2 and PCLMULQDQ were detected above; the pointer/length pair
            // describes the live `data` buffer.
            unsafe {
                assert_eq!(
                    crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff,
                    checksum
                );
            }
        }
    }
}
