Those tests will fail comparing rocblas vs openblas, because the testing program is so strict that it cannot tolerate the numerical differences which is actually OK. https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1202 --- orig/clients/gtest/known_bugs.yaml +++ rocBLAS-rocm-4.3.0/clients/gtest/known_bugs.yaml @@ -7,6 +7,16 @@ Known bugs: - { function: gemm_ex, a_type: i8_r, b_type: i8_r, c_type: i32_r, d_type: i32_r, compute_type: i32_r, flags: 0, known_bug_platforms: "gfx900,gfx906,gfx1010,gfx1011,gfx1012,gfx1030" } - { function: gemm_batched_ex, a_type: i8_r, b_type: i8_r, c_type: i32_r, d_type: i32_r, compute_type: i32_r, flags: 0, known_bug_platforms: "gfx900,gfx906,gfx90a,gfx1010,gfx1011,gfx1012,gfx1030" } - { function: gemm_strided_batched_ex, a_type: i8_r, b_type: i8_r, c_type: i32_r, d_type: i32_r, compute_type: i32_r, flags: 0, known_bug_platforms: "gfx900,gfx906,gfx1010,gfx1011,gfx1012,gfx1030" } +# gemv openblas reference differences due to summation order dependent roundoff accumulation with large M float complex +# 8th significant digit difference vs CPU on single precision float math, leads to expected equality test failure +# code needs to be changed to a tolerance test or reduce M for float complex type if using equality vs. CPU reference +- { function: gemv, a_type: f32_c, transA: T, M: 131071 } +- { function: gemv, a_type: f32_c, transA: C, M: 131071 } +- { function: gemv_batched, a_type: f32_c, transA: T, M: 131071 } +- { function: gemv_batched, a_type: f32_c, transA: C, M: 131071 } +- { function: gemv_strided_batched, a_type: f32_c, transA: T, M: 131071 } +- { function: gemv_strided_batched, a_type: f32_c, transA: C, M: 131071 } + #- { function: gemm_ex, a_type: bf16_r, b_type: bf16_r, c_type: bf16_r, d_type: bf16_r, compute_type: f32_r, transA: C, transB: N, M: 512, N: 512, K: 512, lda: 512, ldb: 512, ldc: 512, ldd: 512, alpha: 5.0, alphai: 0.0, beta: 0.0, betai: 0.0, known_bug_platforms: gfx908 } #- { function: gemm_ex, a_type: bf16_r, b_type: bf16_r, c_type: bf16_r, d_type: bf16_r, compute_type: f32_r, transA: C, transB: N, M: 512, N: 512, K: 512, lda: 512, ldb: 512, ldc: 512, ldd: 512, alpha: 0.0, alphai: 0.0, beta: 3.0, betai: 0.0, known_bug_platforms: gfx908 }