gentoo auto-resync : 07:08:2024 - 12:37:20

author: V3n3RiX <venerix@koprulu.sector> 2024-08-07 12:37:21 +0100
committer: V3n3RiX <venerix@koprulu.sector> 2024-08-07 12:37:21 +0100
commit: b8c7370a682e4e29cda623222d17a790c01c3642 (patch)
tree: f6caa14689bd00a5760eadaa381ff41e50ef3c1b /sci-libs/caffe2
parent: 8a4997a7e2d1e36c089d4d76935b5a902d98d3d0 (diff)
8 files changed, 400 insertions, 28 deletions
diff --git a/sci-libs/caffe2/Manifest b/sci-libs/caffe2/Manifest
index 79cf2c8c93df..22ad33c0966d 100644
--- a/sci-libs/caffe2/Manifest
+++ b/sci-libs/caffe2/Manifest
@@ -10,11 +10,16 @@ AUX caffe2-2.2.1-gentoo.patch 6765 BLAKE2B 924338e5823825d18220c33e9168f96b59873
 AUX caffe2-2.2.2-musl.patch 363 BLAKE2B 9c62b8d93b430cec9d0e739802d5938933109369c003eda74fe1242d5bb61c50c70dd1cc52aa56d6b79c08f55328b991c8fafce60fa94d9377e84ddf14ab8d4a SHA512 2568001959399d76ce8a45e08dd54c0b297fc02a856b4d8a4003aa4dc12f5ded3e821022214df4997d4bd9de7515e0d2ebe2e465dca574b773155b8f9f5080f8
 AUX caffe2-2.3.0-CMakeFix.patch 519 BLAKE2B 0249b7c31cea647a0c82d94cd99fac3415cbd357aacde4d0cc0e5b936c27dc96afcbecdb9ae55ee464080603c9c71f6c995166ae7045e19e9d874dcf688b4a0a SHA512 a8c5200abf4f059a2c43a2fa0b2a639dde60cbf48f6bb461c133e02b7986e8bc9bb98c4a1c51478308ac13b886691c89debf0b762fc821cd52500f5648c3bd39
 AUX caffe2-2.3.0-cudnn_include_fix.patch 587 BLAKE2B 8ffeb9080ee77b953e7a77c9bea9af8c078adf147c314e07cf40d6f9ce1c988ac201e282ffd67a29703ced5885329d44be013c54cf3ba870c009aed40e65fefc SHA512 b37427e35d8147e603331eb344a3542ed31d0b133df3c7ce10ebecec93b1d09d040f77c33f23f70e9835db10fa209c0318b89b70b3b91263655d160ed737d6f3
+AUX caffe2-2.3.0-exclude-aotriton.patch 1380 BLAKE2B 603b6be7d093726a4ebb2f23b3413f6bdc360232614fddb2a1bf73d3f311d5476127340b084796213f1091eed7b8733dbc129df213f7e0eb9d1bb04e0541995a SHA512 173297bba5287ba7ffd0e6f61364f747fcf98a66990b69f1d234e41dc23ae4f645d4c00759a92bf624a691a0d16cbd1e52c45aeace1c19c06ff7f8e676d67df7
+AUX caffe2-2.3.0-fix-gcc-clang-abi-compat.patch 822 BLAKE2B ca2ad94c4293e120fa93cc535f295dee07da9cd3d98c10af57688a04daede8bb2e17dc7f91e88c937215866e1c65ea1f0a6c20fac02c00626a6ad3d2255089b7 SHA512 149ae161f0224d3500ae928a4077ae625bbc76853f1049c441ea12bf44ae95a18ba57aeba9b879df0a944fc88151290b49e7323e954d32eaf0ec0e520b77ad47
+AUX caffe2-2.3.0-fix-libcpp.patch 1281 BLAKE2B 67943ec6e79327c854ebcd3538f68dd2866530f8799399605a288b21477a3ea9673ad85469d04510347736d656135f7cec90a3254ea0e7572dde3e5be948b6a2 SHA512 b92b80b341dadd43a6a8d75a14a2e5325a2bbbacbb55c960f1c55b50a87e0a5f9298e9cf4faeb3b2684985d6fb439a5f8b908c0227cbf2baed9c9f5b29aa3d48
+AUX caffe2-2.3.0-fix-rocm-gcc14-clamp.patch 1009 BLAKE2B ad27422dfc7be2720b972e1bbc417874a1060d2cbd5edb0acf166a7437963702b830766950db33381d6f32c6f95a001d8424ece73f70603fae8bf5f50b2ba255 SHA512 67f26127632cfca91389fdc60cea4d31a9b259e5547ccd6778bfc98a19bdd7b632ec17abbe888842ff613430c49aac37e18aca441335a947f973fd9d978ac3fe
+AUX caffe2-2.3.0-optional-hipblaslt.patch 10001 BLAKE2B a73913a9d82acfb780ad95fc1aabef1dfbd20243a8caef7136dbdad72a120c8778348d82e25d9171453e159580caf1ec5fdaa9bdbef4a09981721579e50f6b21 SHA512 222d33a3253d35c64dc151d12a42a9a0d4edca7fe60e1bb9b0c43df07292a9a061b2259a5696b4d568db265b0df4f065b3ee54672481c788ce1e2b0ae01b8488
 AUX caffe2-2.3.0-rocm-fix-std-cpp17.patch 3378 BLAKE2B 9e88fa1bf68c397c8122ea5b3504a22b3f6ef92c77dad8bd84ee03b4f75792b0e1281d8b1aa981ad1bf65060179fa08ef14e776e82abdec9147dfbb3bf37a7ae SHA512 7797a140abf736f2a4628cd727cf0c58ed39c9764b9ce3b67d17fc0c9b9965e647266c815e5322f96f807680120e25ccdbbc66b66c7c6cf84edb811330ad452c
 DIST pytorch-2.2.2.tar.gz 116367503 BLAKE2B 0be22f2ec4b9aac6f5e976664cae01facf07929a32565cd57d7cc5b2d9888e9ae71ca301853752fe8f31d174d04c9974eb9ed2f3d452360a50ccf024f200726a SHA512 7990e0f9484038c3458c0bda2c863bf2b19e56edab81fc5938c6e0f08b17558287f853bb67350e8cca8f42bec0f1d4ba0e94e50a145db8da44bdd4bd703d91d0
 DIST pytorch-2.3.0.tar.gz 117029829 BLAKE2B 8f9c0d71ee0a9219b495eddccdcc65107f7ad537c43c68100b229f3d27b0e6c01ccb1659c7fffc356a48d80f2adc0a10361305dc8f1df20446de837d380f89f6 SHA512 67f7e9a096c3ffb952206ebf9105bedebb68c24ad82456083adf1d1d210437fcaa9dd52b68484cfc97d408c9eebc9541c76868c34a7c9982494dc3f424cfb07c
 DIST pytorch-2.3.1.tar.gz 117035696 BLAKE2B d419d7fa1342f1fb317ffce09ec9dc1447414627cc83d36578fe60f68c283c620b2b4d49f414cd206d537b90b16432a06cd1941662720db05d5e2b6c493325f5 SHA512 e1bcae44f9939fc7ccb1360a9b1970d92426f25e5de73e36964df3dd15ad5d8d9f5bd2f9a7dda6b8f64e2bba3674005bd869f542489cc442ad0125a02676f587
 EBUILD caffe2-2.2.2-r1.ebuild 7452 BLAKE2B 31ef525960d7c3866580985f9ba9736e5419f17c2a63251b8c4fe961a6789a33dee746b14b12fa331c8c20a6c821565b92dd2cb7c6cdd61678bba7ddf5fc7400 SHA512 f232d901c08e3fefd0d6260d07915786747df8f83269c3980c16112c968114c3cdb7daccd8132a6a3850a266a04b38f6df6d2b1f1bc35f473fb9a58d5fff3452
-EBUILD caffe2-2.3.0-r3.ebuild 7542 BLAKE2B 6b230db4d2a033b5078e8c61e40955a0981aa3b6b06734145e4bec8f78719329a3825218045a8f57efce9bfddad7b252ee4447d82cc0637db8656d5f8178b9cc SHA512 d3861111081d26a883790e7a5097ddf77ef2e491eb4471a3cc95d25e659ad8c283c22bda5674cb09d575768da5b6a401ea5347febd4f194dbedd4196d1ac628d
-EBUILD caffe2-2.3.1.ebuild 7563 BLAKE2B 17433295ef6aaf94567a5261e9df6a2b5aeac0b2c18eb4c45375ed828cbba7ac874811c0da8ca51de84482d5e4b62b48ccecd8924ad42c72a1320ff8b3b715c6 SHA512 c6868b614f7a9c3847e32ba054d821e96486d57cdf1d4597d3a6bb6f9045bd19fa65d5462af31c471eafddd4d5e72278d68de59c21cb21ab17df381fd181fe10
+EBUILD caffe2-2.3.0-r3.ebuild 8441 BLAKE2B 7b7c44a04e072fbf4bcc4fabf470b5256b85b36bb5035a56ceef65843dcaa1eec1eb3cdf9246cf3c672b6d3f4885816a037ef875f77bbf38f9d5a0fafda7981c SHA512 34306a2af160fef5ad902d116da044e96d4eaafd46b8ea511f7b04ef590ccdace7deabeb8905a5b69d9e1a21fc58ab038356d4542dd378c21078a75d08c188ca
+EBUILD caffe2-2.3.1.ebuild 8462 BLAKE2B 1c079bd3119ad0acd224a99cc029ba1e8d557065bbd2cfa30670f6f3720072520a072d0abed1e21f41781581f0b6711e809581fe01e363b613e2b82e38635d94 SHA512 591bee9c3a6beaa9f1687881381692bd7de32a6203d924d36ab89c2fc7de754fd7a5a4056e0e19f293aa836975d1d9321ee5997939270ff10df41fd31e5c9157
 MISC metadata.xml 1225 BLAKE2B ab7fb0bf8b2d37ddaa1a9ecc815eb094e85465d20d3a30af081b42e0b60ade9858d0053b101ba0e7750a90cb48b5b79db9bdc2729bf66d0420732489da62fe54 SHA512 dfb58597fb4bcdd7df0fcc3f2514518e118e8fc9b1cd24868aab60c32a62ff419b8b72a7c294925eff4c8871cc8df606af7fa60bfa99901091d8195101ee1153
diff --git a/sci-libs/caffe2/caffe2-2.3.0-r3.ebuild b/sci-libs/caffe2/caffe2-2.3.0-r3.ebuild
index c01e904d8eb0..666800d8f4b6 100644
--- a/sci-libs/caffe2/caffe2-2.3.0-r3.ebuild
+++ b/sci-libs/caffe2/caffe2-2.3.0-r3.ebuild
@@ -4,7 +4,7 @@
 EAPI=8
 
 PYTHON_COMPAT=( python3_{10..12} )
-ROCM_VERSION=5.7
+ROCM_VERSION=6.1
 inherit python-single-r1 cmake cuda flag-o-matic prefix rocm
 
 MYPN=pytorch
@@ -65,18 +65,23 @@ RDEPEND="
 	opencv? ( media-libs/opencv:= )
 	qnnpack? ( sci-libs/QNNPACK )
 	rocm? (
-		>=dev-util/hip-5.7
-		>=dev-libs/rccl-5.7[${ROCM_USEDEP}]
-		>=sci-libs/rocThrust-5.7[${ROCM_USEDEP}]
-		>=sci-libs/rocPRIM-5.7[${ROCM_USEDEP}]
-		>=sci-libs/hipBLAS-5.7[${ROCM_USEDEP}]
-		>=sci-libs/hipFFT-5.7[${ROCM_USEDEP}]
-		>=sci-libs/hipSPARSE-5.7[${ROCM_USEDEP}]
-		>=sci-libs/hipRAND-5.7[${ROCM_USEDEP}]
-		>=sci-libs/hipCUB-5.7[${ROCM_USEDEP}]
-		>=sci-libs/hipSOLVER-5.7[${ROCM_USEDEP}]
-		>=sci-libs/miopen-5.7[${ROCM_USEDEP}]
-		>=dev-util/roctracer-5.7[${ROCM_USEDEP}]
+		=dev-util/hip-6.1*
+		=dev-libs/rccl-6.1*[${ROCM_USEDEP}]
+		=sci-libs/rocThrust-6.1*[${ROCM_USEDEP}]
+		=sci-libs/rocPRIM-6.1*[${ROCM_USEDEP}]
+		=sci-libs/hipBLAS-6.1*[${ROCM_USEDEP}]
+		=sci-libs/hipFFT-6.1*[${ROCM_USEDEP}]
+		=sci-libs/hipSPARSE-6.1*[${ROCM_USEDEP}]
+		=sci-libs/hipRAND-6.1*[${ROCM_USEDEP}]
+		=sci-libs/hipCUB-6.1*[${ROCM_USEDEP}]
+		=sci-libs/hipSOLVER-6.1*[${ROCM_USEDEP}]
+		=sci-libs/miopen-6.1*[${ROCM_USEDEP}]
+		=dev-util/roctracer-6.1*[${ROCM_USEDEP}]
+
+		amdgpu_targets_gfx90a? ( =sci-libs/hipBLASLt-6.1*[amdgpu_targets_gfx90a] )
+		amdgpu_targets_gfx940? ( =sci-libs/hipBLASLt-6.1*[amdgpu_targets_gfx940] )
+		amdgpu_targets_gfx941? ( =sci-libs/hipBLASLt-6.1*[amdgpu_targets_gfx941] )
+		amdgpu_targets_gfx942? ( =sci-libs/hipBLASLt-6.1*[amdgpu_targets_gfx942] )
 	)
 	distributed? ( sci-libs/tensorpipe[cuda?] )
 	xnnpack? ( >=sci-libs/XNNPACK-2022.12.22 )
@@ -111,6 +116,11 @@ PATCHES=(
 	"${FILESDIR}"/${P}-rocm-fix-std-cpp17.patch
 	"${FILESDIR}"/${PN}-2.2.2-musl.patch
 	"${FILESDIR}"/${P}-CMakeFix.patch
+	"${FILESDIR}"/${PN}-2.3.0-exclude-aotriton.patch
+	"${FILESDIR}"/${PN}-2.3.0-fix-rocm-gcc14-clamp.patch
+	"${FILESDIR}"/${PN}-2.3.0-optional-hipblaslt.patch
+	"${FILESDIR}"/${PN}-2.3.0-fix-libcpp.patch
+	"${FILESDIR}"/${PN}-2.3.0-fix-gcc-clang-abi-compat.patch
 )
 
 src_prepare() {
@@ -235,11 +245,20 @@ src_configure() {
 		)
 	elif use rocm; then
 		export PYTORCH_ROCM_ARCH="$(get_amdgpu_flags)"
+		local use_hipblaslt="OFF"
+		if use amdgpu_targets_gfx90a || use amdgpu_targets_gfx940 || use amdgpu_targets_gfx941 \
+			|| use amdgpu_targets_gfx942; then
+			use_hipblaslt="ON"
+		fi
 
 		mycmakeargs+=(
 			-DUSE_NCCL=ON
 			-DUSE_SYSTEM_NCCL=ON
+			-DUSE_HIPBLASLT=${use_hipblaslt}
 		)
+
+		# ROCm libraries produce too much warnings
+		append-cxxflags -Wno-deprecated-declarations -Wno-unused-result
 	fi
 
 	if use onednn; then
diff --git a/sci-libs/caffe2/caffe2-2.3.1.ebuild b/sci-libs/caffe2/caffe2-2.3.1.ebuild
index 51bab7c5dac7..ee1da28aa12f 100644
--- a/sci-libs/caffe2/caffe2-2.3.1.ebuild
+++ b/sci-libs/caffe2/caffe2-2.3.1.ebuild
@@ -4,7 +4,7 @@
 EAPI=8
 
 PYTHON_COMPAT=( python3_{10..12} )
-ROCM_VERSION=5.7
+ROCM_VERSION=6.1
 inherit python-single-r1 cmake cuda flag-o-matic prefix rocm
 
 MYPN=pytorch
@@ -65,18 +65,23 @@ RDEPEND="
 	opencv? ( media-libs/opencv:= )
 	qnnpack? ( sci-libs/QNNPACK )
 	rocm? (
-		=dev-util/hip-5.7*
-		=dev-libs/rccl-5.7*[${ROCM_USEDEP}]
-		=sci-libs/rocThrust-5.7*[${ROCM_USEDEP}]
-		=sci-libs/rocPRIM-5.7*[${ROCM_USEDEP}]
-		=sci-libs/hipBLAS-5.7*[${ROCM_USEDEP}]
-		=sci-libs/hipFFT-5.7*[${ROCM_USEDEP}]
-		=sci-libs/hipSPARSE-5.7*[${ROCM_USEDEP}]
-		=sci-libs/hipRAND-5.7*[${ROCM_USEDEP}]
-		=sci-libs/hipCUB-5.7*[${ROCM_USEDEP}]
-		=sci-libs/hipSOLVER-5.7*[${ROCM_USEDEP}]
-		=sci-libs/miopen-5.7*[${ROCM_USEDEP}]
-		=dev-util/roctracer-5.7*[${ROCM_USEDEP}]
+		=dev-util/hip-6.1*
+		=dev-libs/rccl-6.1*[${ROCM_USEDEP}]
+		=sci-libs/rocThrust-6.1*[${ROCM_USEDEP}]
+		=sci-libs/rocPRIM-6.1*[${ROCM_USEDEP}]
+		=sci-libs/hipBLAS-6.1*[${ROCM_USEDEP}]
+		=sci-libs/hipFFT-6.1*[${ROCM_USEDEP}]
+		=sci-libs/hipSPARSE-6.1*[${ROCM_USEDEP}]
+		=sci-libs/hipRAND-6.1*[${ROCM_USEDEP}]
+		=sci-libs/hipCUB-6.1*[${ROCM_USEDEP}]
+		=sci-libs/hipSOLVER-6.1*[${ROCM_USEDEP}]
+		=sci-libs/miopen-6.1*[${ROCM_USEDEP}]
+		=dev-util/roctracer-6.1*[${ROCM_USEDEP}]
+
+		amdgpu_targets_gfx90a? ( =sci-libs/hipBLASLt-6.1*[amdgpu_targets_gfx90a] )
+		amdgpu_targets_gfx940? ( =sci-libs/hipBLASLt-6.1*[amdgpu_targets_gfx940] )
+		amdgpu_targets_gfx941? ( =sci-libs/hipBLASLt-6.1*[amdgpu_targets_gfx941] )
+		amdgpu_targets_gfx942? ( =sci-libs/hipBLASLt-6.1*[amdgpu_targets_gfx942] )
 	)
 	distributed? ( sci-libs/tensorpipe[cuda?] )
 	xnnpack? ( >=sci-libs/XNNPACK-2022.12.22 )
@@ -111,6 +116,11 @@ PATCHES=(
 	"${FILESDIR}"/${PN}-2.3.0-rocm-fix-std-cpp17.patch
 	"${FILESDIR}"/${PN}-2.2.2-musl.patch
 	"${FILESDIR}"/${PN}-2.3.0-CMakeFix.patch
+	"${FILESDIR}"/${PN}-2.3.0-exclude-aotriton.patch
+	"${FILESDIR}"/${PN}-2.3.0-fix-rocm-gcc14-clamp.patch
+	"${FILESDIR}"/${PN}-2.3.0-optional-hipblaslt.patch
+	"${FILESDIR}"/${PN}-2.3.0-fix-libcpp.patch
+	"${FILESDIR}"/${PN}-2.3.0-fix-gcc-clang-abi-compat.patch
 )
 
 src_prepare() {
@@ -235,11 +245,20 @@ src_configure() {
 		)
 	elif use rocm; then
 		export PYTORCH_ROCM_ARCH="$(get_amdgpu_flags)"
+		local use_hipblaslt="OFF"
+		if use amdgpu_targets_gfx90a || use amdgpu_targets_gfx940 || use amdgpu_targets_gfx941 \
+			|| use amdgpu_targets_gfx942; then
+			use_hipblaslt="ON"
+		fi
 
 		mycmakeargs+=(
 			-DUSE_NCCL=ON
 			-DUSE_SYSTEM_NCCL=ON
+			-DUSE_HIPBLASLT=${use_hipblaslt}
 		)
+
+		# ROCm libraries produce too much warnings
+		append-cxxflags -Wno-deprecated-declarations -Wno-unused-result
 	fi
 
 	if use onednn; then
diff --git a/sci-libs/caffe2/files/caffe2-2.3.0-exclude-aotriton.patch b/sci-libs/caffe2/files/caffe2-2.3.0-exclude-aotriton.patch
new file mode 100644
index 000000000000..2c65987acd85
--- /dev/null
+++ b/sci-libs/caffe2/files/caffe2-2.3.0-exclude-aotriton.patch
@@ -0,0 +1,35 @@
+Disables aotriton download when both USE_FLASH_ATTENTION and USE_MEM_EFF_ATTENTION cmake flags are OFF
+Backports upstream PR to 2.3.0: https://github.com/pytorch/pytorch/pull/130197
+--- a/cmake/Dependencies.cmake
++++ b/cmake/Dependencies.cmake
+@@ -1334,7 +1334,9 @@ if(USE_ROCM)
+       message(STATUS "Disabling Kernel Assert for ROCm")
+     endif()
+ 
+-    include(${CMAKE_CURRENT_LIST_DIR}/External/aotriton.cmake)
++    if(USE_FLASH_ATTENTION)
++      include(${CMAKE_CURRENT_LIST_DIR}/External/aotriton.cmake)
++    endif()
+     if(USE_CUDA)
+       caffe2_update_option(USE_MEM_EFF_ATTENTION OFF)
+     endif()
+--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
++++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
+@@ -21,7 +21,7 @@
+ #include <cmath>
+ #include <functional>
+ 
+-#if USE_ROCM
++#if defined(USE_ROCM) && defined(USE_FLASH_ATTENTION)
+ #include <aotriton/flash.h>
+ #endif
+ 
+@@ -186,7 +186,7 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
+   // Check that the gpu is capable of running flash attention
+   using sm80 = SMVersion<8, 0>;
+   using sm90 = SMVersion<9, 0>;
+-#if USE_ROCM
++#if defined(USE_ROCM) && defined(USE_FLASH_ATTENTION)
+   auto stream = at::cuda::getCurrentCUDAStream().stream();
+   if (hipSuccess != aotriton::v2::flash::check_gpu(stream)) {
+       auto dprops = at::cuda::getCurrentDeviceProperties();
diff --git a/sci-libs/caffe2/files/caffe2-2.3.0-fix-gcc-clang-abi-compat.patch b/sci-libs/caffe2/files/caffe2-2.3.0-fix-gcc-clang-abi-compat.patch
new file mode 100644
index 000000000000..a6f981b7e054
--- /dev/null
+++ b/sci-libs/caffe2/files/caffe2-2.3.0-fix-gcc-clang-abi-compat.patch
@@ -0,0 +1,17 @@
+
+When gcc builds libtorch_cpu.so and hipcc (clang-18) build libtorch_hip.so,
+resulting binary fails in runtime due to different mangling.
+Related issue in LLVM: https://github.com/llvm/llvm-project/issues/85656
+Fixed in pytorch-2.4.0 in https://github.com/pytorch/pytorch/commit/a89f442f0b103fa6f38103784a2dfedbd147f863
+--- a/cmake/Dependencies.cmake
++++ b/cmake/Dependencies.cmake
+@@ -1314,6 +1314,9 @@ if(USE_ROCM)
+        list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling)
+     endif(CMAKE_BUILD_TYPE MATCHES Debug)
+ 
++    # needed for compat with newer versions of hip-clang that introduced C++20 mangling rules
++    list(APPEND HIP_HIPCC_FLAGS -fclang-abi-compat=17)
++
+     set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS})
+     # Ask hcc to generate device code during compilation so we can use
+     # host linker to link.
diff --git a/sci-libs/caffe2/files/caffe2-2.3.0-fix-libcpp.patch b/sci-libs/caffe2/files/caffe2-2.3.0-fix-libcpp.patch
new file mode 100644
index 000000000000..75808fd7ec50
--- /dev/null
+++ b/sci-libs/caffe2/files/caffe2-2.3.0-fix-libcpp.patch
@@ -0,0 +1,24 @@
+Workaround for libc++ issue https://github.com/llvm/llvm-project/issues/100802
+"reference to __host__ function 'memcpy' in __device__ function"
+--- a/c10/util/Half.h
++++ b/c10/util/Half.h
+@@ -227,7 +227,7 @@ C10_HOST_DEVICE inline float fp16_ieee_to_fp32_value(uint16_t h) {
+   // const float exp_scale = 0x1.0p-112f;
+   constexpr uint32_t scale_bits = (uint32_t)15 << 23;
+   float exp_scale_val = 0;
+-  std::memcpy(&exp_scale_val, &scale_bits, sizeof(exp_scale_val));
++  memcpy(&exp_scale_val, &scale_bits, sizeof(exp_scale_val));
+   const float exp_scale = exp_scale_val;
+   const float normalized_value =
+       fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+@@ -298,8 +298,8 @@ inline uint16_t fp16_ieee_from_fp32_value(float f) {
+   constexpr uint32_t scale_to_inf_bits = (uint32_t)239 << 23;
+   constexpr uint32_t scale_to_zero_bits = (uint32_t)17 << 23;
+   float scale_to_inf_val = 0, scale_to_zero_val = 0;
+-  std::memcpy(&scale_to_inf_val, &scale_to_inf_bits, sizeof(scale_to_inf_val));
+-  std::memcpy(
++  memcpy(&scale_to_inf_val, &scale_to_inf_bits, sizeof(scale_to_inf_val));
++  memcpy(
+       &scale_to_zero_val, &scale_to_zero_bits, sizeof(scale_to_zero_val));
+   const float scale_to_inf = scale_to_inf_val;
+   const float scale_to_zero = scale_to_zero_val;
diff --git a/sci-libs/caffe2/files/caffe2-2.3.0-fix-rocm-gcc14-clamp.patch b/sci-libs/caffe2/files/caffe2-2.3.0-fix-rocm-gcc14-clamp.patch
new file mode 100644
index 000000000000..81ae075c67cc
--- /dev/null
+++ b/sci-libs/caffe2/files/caffe2-2.3.0-fix-rocm-gcc14-clamp.patch
@@ -0,0 +1,18 @@
+Fix hip compilation with gcc-14
+Upstream commit: https://github.com/pytorch/pytorch/commit/8c2c3a03fb87c3568a22362d83b00d82b9fb3db2
+--- a/aten/src/ATen/native/cuda/IndexKernel.cu
++++ b/aten/src/ATen/native/cuda/IndexKernel.cu
+@@ -259,7 +259,13 @@ void index_put_kernel_quantized_cuda(TensorIterator& iter, const IntArrayRef ind
+ 
+     gpu_index_kernel(iter, index_size, index_stride, [inv_scale, zero_point, qmin, qmax]C10_DEVICE(char* const out_data, const char* const in_data, const int64_t offset) {
+       int64_t qvalue = static_cast<int64_t>(zero_point + nearbyintf(*(float*)in_data * inv_scale));
++      // See https://github.com/pytorch/pytorch/issues/127666
++      // hip-clang std::clamp __glibcxx_assert_fail host function when building on Fedora40/gcc14
++#ifndef USE_ROCM
+       qvalue = std::clamp(qvalue, qmin, qmax);
++#else
++      qvalue = (qvalue < qmin) ? qmin : (qmax < qvalue) ? qmax : qvalue;
++#endif
+       *(scalar_t*)(out_data + offset) = static_cast<scalar_t>(qvalue);
+     });
+   });
diff --git a/sci-libs/caffe2/files/caffe2-2.3.0-optional-hipblaslt.patch b/sci-libs/caffe2/files/caffe2-2.3.0-optional-hipblaslt.patch
new file mode 100644
index 000000000000..dc544255c2bd
--- /dev/null
+++ b/sci-libs/caffe2/files/caffe2-2.3.0-optional-hipblaslt.patch
@@ -0,0 +1,235 @@
+Makes hipblaslt optional to simplify build for non-datacenter GPUs.
+Based on https://github.com/pytorch/pytorch/pull/120551 with added USE_HIPBLASLT cmake option.
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -225,6 +225,9 @@ option(USE_FAKELOWP "Use FakeLowp operators" OFF)
+ option(USE_FFMPEG "Use ffmpeg" OFF)
+ option(USE_GFLAGS "Use GFLAGS" OFF)
+ option(USE_GLOG "Use GLOG" OFF)
++cmake_dependent_option(
++    USE_HIPBLASLT "Use hipBLASLt" ON
++    "USE_ROCM" OFF)
+ option(USE_LEVELDB "Use LEVELDB" OFF)
+ option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
+ option(USE_LMDB "Use LMDB" OFF)
+--- a/aten/src/ATen/cuda/CUDABlas.cpp
++++ b/aten/src/ATen/cuda/CUDABlas.cpp
+@@ -14,7 +14,7 @@
+ #include <c10/util/irange.h>
+ 
+ #ifdef USE_ROCM
+-#if ROCM_VERSION >= 60000
++#ifdef USE_HIPBLASLT
+ #include <hipblaslt/hipblaslt-ext.hpp>
+ #endif
+ // until hipblas has an API to accept flags, we must use rocblas here
+@@ -781,7 +781,7 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
+   }
+ }
+ 
+-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
++#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
+ 
+ #if defined(USE_ROCM) && ROCM_VERSION >= 50700 && ROCM_VERSION < 60000
+ // only for rocm 5.7 where we first supported hipblaslt, it was difficult
+@@ -912,6 +912,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
+ };
+ } // namespace
+ 
++#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
+ template <typename Dtype>
+ void gemm_and_bias(
+     bool transpose_mat1,
+@@ -1124,7 +1125,7 @@ template void gemm_and_bias(
+     at::BFloat16* result_ptr,
+     int64_t result_ld,
+     GEMMAndBiasActivationEpilogue activation);
+-
++#endif
+ void scaled_gemm(
+     char transa,
+     char transb,
+--- a/aten/src/ATen/cuda/CUDABlas.h
++++ b/aten/src/ATen/cuda/CUDABlas.h
+@@ -82,7 +82,7 @@ void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
+ template <>
+ void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
+ 
+-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
++#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
+ enum GEMMAndBiasActivationEpilogue {
+   None,
+   RELU,
+--- a/aten/src/ATen/cuda/CUDAContextLight.h
++++ b/aten/src/ATen/cuda/CUDAContextLight.h
+@@ -9,7 +9,7 @@
+ 
+ // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also
+ // added bf16 support
+-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
++#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
+ #include <cublasLt.h>
+ #endif
+ 
+@@ -82,7 +82,7 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator();
+ /* Handles */
+ TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle();
+ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
+-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
++#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
+ TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();
+ #endif
+ 
+--- a/aten/src/ATen/cuda/CublasHandlePool.cpp
++++ b/aten/src/ATen/cuda/CublasHandlePool.cpp
+@@ -29,7 +29,7 @@ namespace at::cuda {
+ 
+ namespace {
+ 
+-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
++#if defined(USE_ROCM) && defined(USE_HIPBLASLT)
+ void createCublasLtHandle(cublasLtHandle_t *handle) {
+   TORCH_CUDABLAS_CHECK(cublasLtCreate(handle));
+ }
+@@ -190,7 +190,7 @@ cublasHandle_t getCurrentCUDABlasHandle() {
+   return handle;
+ }
+ 
+-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
++#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
+ cublasLtHandle_t getCurrentCUDABlasLtHandle() {
+ #ifdef USE_ROCM
+   c10::DeviceIndex device = 0;
+--- a/aten/src/ATen/cuda/tunable/TunableGemm.h
++++ b/aten/src/ATen/cuda/tunable/TunableGemm.h
+@@ -11,7 +11,7 @@
+ 
+ #include <ATen/cuda/tunable/GemmCommon.h>
+ #ifdef USE_ROCM
+-#if ROCM_VERSION >= 50700
++#ifdef USE_HIPBLASLT
+ #include <ATen/cuda/tunable/GemmHipblaslt.h>
+ #endif
+ #include <ATen/cuda/tunable/GemmRocblas.h>
+@@ -166,7 +166,7 @@ class GemmTunableOp : public TunableOp<GemmParams<T>, StreamTimer> {
+     }
+ #endif
+ 
+-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
++#if defined(USE_ROCM) && defined(USE_HIPBLASLT)
+     static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
+     if (env == nullptr || strcmp(env, "1") == 0) {
+       // disallow tuning of hipblaslt with c10::complex
+@@ -240,7 +240,7 @@ class GemmStridedBatchedTunableOp : public TunableOp<GemmStridedBatchedParams<T>
+     }
+ #endif
+ 
+-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
++#if defined(USE_ROCM) && defined(USE_HIPBLASLT)
+     static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
+     if (env == nullptr || strcmp(env, "1") == 0) {
+       // disallow tuning of hipblaslt with c10::complex
+--- a/aten/src/ATen/native/cuda/Blas.cpp
++++ b/aten/src/ATen/native/cuda/Blas.cpp
+@@ -155,7 +155,7 @@ enum class Activation {
+   GELU,
+ };
+ 
+-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
++#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
+ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activation a) {
+   switch (a) {
+     case Activation::None:
+@@ -193,6 +193,7 @@ static bool getDisableAddmmCudaLt() {
+ 
+ #ifdef USE_ROCM
+ static bool isSupportedHipLtROCmArch(int index) {
++#if defined(USE_HIPBLASLT)
+     hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index);
+     std::string device_arch = prop->gcnArchName;
+     static const std::vector<std::string> archs = {"gfx90a", "gfx940", "gfx941", "gfx942"};
+@@ -203,6 +204,7 @@ static bool isSupportedHipLtROCmArch(int index) {
+         }
+     }
+     TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!");
++#endif
+     return false;
+ }
+ #endif
+@@ -228,7 +230,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
+   at::ScalarType scalar_type = self.scalar_type();
+   c10::MaybeOwned<Tensor> self_;
+   if (&result != &self) {
+-#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11040 && !defined(_MSC_VER)) || defined(USE_ROCM) && ROCM_VERSION >= 50700
++#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11040 && !defined(_MSC_VER)) || defined(USE_ROCM) && defined(USE_HIPBLASLT)
+     // Strangely, if mat2 has only 1 row or column, we get
+     // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic.
+     // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
+@@ -271,7 +273,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
+     }
+     self__sizes = self_->sizes();
+   } else {
+-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
++#if defined(USE_ROCM) && defined(USE_HIPBLASLT)
+     useLtInterface = !disable_addmm_cuda_lt &&
+         result.dim() == 2 && result.is_contiguous() &&
+         isSupportedHipLtROCmArch(self.device().index()) &&
+@@ -322,7 +324,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
+ 
+   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj());
+ 
+-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
++#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
+   if (useLtInterface) {
+     AT_DISPATCH_FLOATING_TYPES_AND2(
+         at::ScalarType::Half,
+@@ -876,7 +878,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
+   at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});
+   at::native::resize_output(amax, {});
+ 
+-#if !defined(USE_ROCM) && !defined(_MSC_VER) || (defined(USE_ROCM) && ROCM_VERSION >= 60000)
++#if !defined(USE_ROCM) && !defined(_MSC_VER) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
+   cublasCommonArgs args(mat1, mat2, out);
+   const auto out_dtype_ = args.result->scalar_type();
+   TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
+@@ -906,7 +908,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
+   TORCH_CHECK(false, "_scaled_mm_out_cuda is not compiled for this platform.");
+ #endif
+ 
+-#if defined(USE_ROCM) && ROCM_VERSION >= 60000
++#if defined(USE_ROCM) && defined(USE_HIPBLASLT)
+   // rocm's hipblaslt does not yet support amax, so calculate separately
+   auto out_float32 = out.to(kFloat);
+   out_float32.abs_();
+--- a/cmake/Dependencies.cmake
++++ b/cmake/Dependencies.cmake
+@@ -1282,6 +1282,9 @@ if(USE_ROCM)
+     if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "6.0.0")
+       list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2)
+     endif()
++    if(hipblast_FOUND)
++      list(APPEND HIP_CXX_FLAGS -DHIPBLASLT)
++    endif()
+     if(HIPBLASLT_CUSTOM_DATA_TYPE)
+       list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_CUSTOM_DATA_TYPE)
+     endif()
+--- a/cmake/public/LoadHIP.cmake
++++ b/cmake/public/LoadHIP.cmake
+@@ -155,7 +155,7 @@ if(HIP_FOUND)
+   find_package_and_print_version(hiprand REQUIRED)
+   find_package_and_print_version(rocblas REQUIRED)
+   find_package_and_print_version(hipblas REQUIRED)
+-  if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.7.0")
++  if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.7.0" AND USE_HIPBLASLT)
+     find_package_and_print_version(hipblaslt REQUIRED)
+   endif()
+   find_package_and_print_version(miopen REQUIRED)
+@@ -191,7 +191,7 @@ if(HIP_FOUND)
+   # roctx is part of roctracer
+   find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib)
+ 
+-  if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.7.0")
++  if(hipblastlt_FOUND)
+     # check whether hipblaslt is using its own datatype
+     set(file "${PROJECT_BINARY_DIR}/hipblaslt_test_data_type.cc")
+     file(WRITE ${file} ""
author	V3n3RiX <venerix@koprulu.sector>	2024-08-07 12:37:21 +0100
committer	V3n3RiX <venerix@koprulu.sector>	2024-08-07 12:37:21 +0100
commit	b8c7370a682e4e29cda623222d17a790c01c3642 (patch)
tree	f6caa14689bd00a5760eadaa381ff41e50ef3c1b /sci-libs/caffe2
parent	8a4997a7e2d1e36c089d4d76935b5a902d98d3d0 (diff)