Diffstat (limited to 'sci-ml/evaluate')
-rw-r--r--   sci-ml/evaluate/Manifest                           |   4
-rw-r--r--   sci-ml/evaluate/evaluate-0.4.3.ebuild              |  54
-rw-r--r--   sci-ml/evaluate/files/evaluate-0.4.0-tests.patch   | 220
-rw-r--r--   sci-ml/evaluate/metadata.xml                       |  12
4 files changed, 290 insertions, 0 deletions
diff --git a/sci-ml/evaluate/Manifest b/sci-ml/evaluate/Manifest
new file mode 100644
index 000000000000..672fda32794c
--- /dev/null
+++ b/sci-ml/evaluate/Manifest
@@ -0,0 +1,4 @@
+AUX evaluate-0.4.0-tests.patch 8354 BLAKE2B 45c6fbcd58eb1899a02b373f882917cb2d76a05f5737a50c60d96f1eeb9b0169f8e526a475ec8aab2829f3ce47082476b802813064ca912353b65652e3f7fc64 SHA512 e296c97d5bf3d06828ddda96f00b20cb763b9c9a8611b148cf0e5c55bb78995a45df1b2cd731e4c45bfe80817ce03485a958c01c2e081aef3cd5238a9011903b
+DIST evaluate-0.4.3.gh.tar.gz 297000 BLAKE2B 5d658cf513a0083194829618bde53c1e6f4a41e48a9ff683c79bc5b57f9f25acc38b42bed5ad56c569b7cae442212fc3dda1ce4db406ab2a49789b485c5c27a0 SHA512 f19f22fb1625544d3e27f25b159449c9f656540b5dbc62f00d13bad2dd33d5591d80f6e81d671d51f802b671602b86b758d9a53be4f63378c181f66eb93c2830
+EBUILD evaluate-0.4.3.ebuild 1656 BLAKE2B 45fa7d1e64a4f699963b72a3f30a493fa932f44b7a110f0f348f2d2d96e8942812858f678d8890f14027a252bdfd64a08c8da4c47bca28055c83f7f0cf8c049b SHA512 f048e71c4ad6537c7d315b6370c6187076a76971b16c4a86a034eb0a91f7746255b98848ef49e9811c967840d51e7cde32f02a2ee25ed46fa4c76d509c4b5408
+MISC metadata.xml 379 BLAKE2B a717b46962e59358925c866c64b2d0bc1dcd5d55d73e814686a09f703e339d2c0de52f6c214c8f795518d6d9dbb46443be11374643d415ff681dedca1511732b SHA512 03d6b58cad73cad46f1101fedf88ec94dc6d7a2028399a20b39939bead4fa402d00224085206a175a33d92417176cc45853060b18faa13769b80527fac9254e1
diff --git a/sci-ml/evaluate/evaluate-0.4.3.ebuild b/sci-ml/evaluate/evaluate-0.4.3.ebuild
new file mode 100644
index 000000000000..5ca6b9f320e2
--- /dev/null
+++ b/sci-ml/evaluate/evaluate-0.4.3.ebuild
@@ -0,0 +1,54 @@
+# Copyright 2023-2025 Gentoo Authors
+# Distributed under the terms of the GNU General Public License v2
+
+EAPI=8
+
+DISTUTILS_USE_PEP517=setuptools
+PYTHON_COMPAT=( python3_{11..12} )
+DISTUTILS_SINGLE_IMPL=1
+inherit distutils-r1
+
+DESCRIPTION="makes evaluating, comparing models and reporting their performance easier"
+HOMEPAGE="
+	https://pypi.org/project/evaluate/
+	https://github.com/huggingface/evaluate
+"
+SRC_URI="https://github.com/huggingface/${PN}/archive/refs/tags/v${PV}.tar.gz
+	-> ${P}.gh.tar.gz"
+
+LICENSE="Apache-2.0"
+SLOT="0"
+KEYWORDS="~amd64"
+
+RDEPEND="
+	$(python_gen_cond_dep '
+		dev-python/matplotlib[${PYTHON_USEDEP}]
+		dev-python/pyarrow[${PYTHON_USEDEP},parquet]
+		dev-python/unidecode[${PYTHON_USEDEP}]
+	')
+	sci-ml/datasets[${PYTHON_SINGLE_USEDEP}]
+	sci-ml/transformers[${PYTHON_SINGLE_USEDEP}]
+"
+BDEPEND="test? (
+	$(python_gen_cond_dep '
+		sci-libs/jiwer[${PYTHON_USEDEP}]
+		sci-libs/seqeval[${PYTHON_USEDEP}]
+	')
+)"
+
+PATCHES=( "${FILESDIR}"/${PN}-0.4.0-tests.patch )
+
+distutils_enable_tests pytest
+
+src_test() {
+	local EPYTEST_DESELECT=(
+		tests/test_evaluation_suite.py::TestEvaluationSuite::test_empty_suite
+		tests/test_evaluation_suite.py::TestEvaluationSuite::test_running_evaluation_suite
+		tests/test_evaluator.py::TestAudioClassificationEvaluator::test_class_init
+		tests/test_evaluator.py::TestAudioClassificationEvaluator::test_overwrite_default_metric
+		tests/test_evaluator.py::TestAudioClassificationEvaluator::test_pipe_init
+		tests/test_evaluator.py::TestAudioClassificationEvaluator::test_raw_pipe_init
+		tests/test_metric.py::TestEvaluationcombined_evaluation::test_modules_from_string_poslabel
+	)
+	distutils-r1_src_test
+}
diff --git a/sci-ml/evaluate/files/evaluate-0.4.0-tests.patch b/sci-ml/evaluate/files/evaluate-0.4.0-tests.patch
new file mode 100644
index 000000000000..cc0a8b6a7eed
--- /dev/null
+++ b/sci-ml/evaluate/files/evaluate-0.4.0-tests.patch
@@ -0,0 +1,220 @@
+--- a/tests/test_evaluator.py	2023-05-14 11:01:54.449768849 +0200
++++ b/tests/test_evaluator.py	2023-05-14 11:06:15.182738125 +0200
+@@ -16,6 +16,7 @@
+ 
+ from time import sleep
+ from unittest import TestCase, mock
++from unittest import skip
+ 
+ from datasets import ClassLabel, Dataset, Features, Sequence, Value
+ from PIL import Image
+@@ -128,6 +128,7 @@
+         return [{"text": "Lorem ipsum"} for _ in inputs]
+ 
+ 
++@skip("require network")
+ class TestEvaluator(TestCase):
+     def setUp(self):
+         self.data = Dataset.from_dict({"label": [1, 0], "text": ["great movie", "horrible movie"]})
+@@ -230,6 +230,7 @@
+         )
+ 
+ 
++@skip("require network")
+ class TestTextClassificationEvaluator(TestCase):
+     def setUp(self):
+         self.data = Dataset.from_dict({"label": [1, 0], "text": ["great movie", "horrible movie"]})
+@@ -394,6 +394,7 @@
+         self.assertAlmostEqual(results["latency_in_seconds"], results["total_time_in_seconds"] / len(data), 5)
+ 
+ 
++@skip("require network")
+ class TestTextClassificationEvaluatorTwoColumns(TestCase):
+     def setUp(self):
+         self.data = Dataset.from_dict(
+@@ -452,6 +452,7 @@
+         self.assertEqual(results["accuracy"], 1.0)
+ 
+ 
++@skip("require network")
+ class TestImageClassificationEvaluator(TestCase):
+     def setUp(self):
+         self.data = Dataset.from_dict(
+@@ -534,6 +535,7 @@
+         self.assertEqual(results["accuracy"], 0)
+ 
+ 
++@skip("require network")
+ class TestQuestionAnsweringEvaluator(TestCase):
+     def setUp(self):
+         self.data = Dataset.from_dict(
+@@ -716,6 +716,7 @@
+         )
+         self.assertEqual(results["overall_accuracy"], 0.5)
+ 
++    @skip("require network")
+     def test_class_init(self):
+         evaluator = TokenClassificationEvaluator()
+         self.assertEqual(evaluator.task, "token-classification")
+@@ -735,6 +736,7 @@
+         )
+         self.assertEqual(results["overall_accuracy"], 2 / 3)
+ 
++    @skip("require network")
+     def test_overwrite_default_metric(self):
+         accuracy = load("seqeval")
+         results = self.evaluator.compute(
+@@ -750,6 +752,7 @@
+         )
+         self.assertEqual(results["overall_accuracy"], 1.0)
+ 
++    @skip("require network")
+     def test_data_loading(self):
+         # Test passing in dataset by name with data_split
+         data = self.evaluator.load_data("evaluate/conll2003-ci", split="validation[:1]")
+@@ -863,6 +866,7 @@
+         self.pipe = DummyTextGenerationPipeline(num_return_sequences=4)
+         self.evaluator = evaluator("text-generation")
+ 
++    @skip("require network")
+     def test_class_init(self):
+         evaluator = TextGenerationEvaluator()
+         self.assertEqual(evaluator.task, "text-generation")
+@@ -877,6 +877,7 @@
+         results = self.evaluator.compute(data=self.data)
+         self.assertIsInstance(results["unique_words"], int)
+ 
++    @skip("require nltk")
+     def test_overwrite_default_metric(self):
+         word_length = load("word_length")
+         results = self.evaluator.compute(
+@@ -906,6 +910,7 @@
+         self.assertEqual(processed_predictions, {"data": ["A", "B", "C", "D"]})
+ 
+ 
++@skip("require network")
+ class TestText2TextGenerationEvaluator(TestCase):
+     def setUp(self):
+         self.data = Dataset.from_dict(
+@@ -979,6 +984,7 @@
+         self.assertEqual(results["bleu"], 0)
+ 
+ 
++@skip("require network")
+ class TestAutomaticSpeechRecognitionEvaluator(TestCase):
+     def setUp(self):
+         self.data = Dataset.from_dict(
+--- a/tests/test_trainer_evaluator_parity.py	2023-05-14 17:50:29.224525549 +0200
++++ b/tests/test_trainer_evaluator_parity.py	2023-05-14 17:37:40.947501195 +0200
+@@ -269,6 +269,7 @@
+         self.assertEqual(transformers_results["eval_HasAns_f1"], evaluator_results["HasAns_f1"])
+         self.assertEqual(transformers_results["eval_NoAns_f1"], evaluator_results["NoAns_f1"])
+ 
++    @unittest.skip('require eval_results.json')
+     def test_token_classification_parity(self):
+         model_name = "hf-internal-testing/tiny-bert-for-token-classification"
+         n_samples = 500
+--- a/tests/test_load.py	2023-05-20 15:45:58.855473557 +0200
++++ b/tests/test_load.py	2023-05-20 15:50:41.620071500 +0200
+@@ -61,6 +61,7 @@
+             hf_modules_cache=self.hf_modules_cache,
+         )
+ 
++    @pytest.mark.skip("require network")
+     def test_HubEvaluationModuleFactory_with_internal_import(self):
+         # "squad_v2" requires additional imports (internal)
+         factory = HubEvaluationModuleFactory(
+@@ -72,6 +73,7 @@
+         module_factory_result = factory.get_module()
+         assert importlib.import_module(module_factory_result.module_path) is not None
+ 
++    @pytest.mark.skip("require network")
+     def test_HubEvaluationModuleFactory_with_external_import(self):
+         # "bleu" requires additional imports (external from github)
+         factory = HubEvaluationModuleFactory(
+@@ -83,6 +85,7 @@
+         module_factory_result = factory.get_module()
+         assert importlib.import_module(module_factory_result.module_path) is not None
+ 
++    @pytest.mark.skip("require network")
+     def test_HubEvaluationModuleFactoryWithScript(self):
+         factory = HubEvaluationModuleFactory(
+             SAMPLE_METRIC_IDENTIFIER,
+@@ -115,6 +118,7 @@
+         module_factory_result = factory.get_module()
+         assert importlib.import_module(module_factory_result.module_path) is not None
+ 
++    @pytest.mark.skip("require network")
+     def test_cache_with_remote_canonical_module(self):
+         metric = "accuracy"
+         evaluation_module_factory(
+@@ -127,6 +131,7 @@
+             metric, download_config=self.download_config, dynamic_modules_path=self.dynamic_modules_path
+         )
+ 
++    @pytest.mark.skip("require network")
+     def test_cache_with_remote_community_module(self):
+         metric = "lvwerra/test"
+         evaluation_module_factory(
+--- a/tests/test_metric.py	2023-05-20 15:54:32.558477445 +0200
++++ b/tests/test_metric.py	2023-05-20 15:55:40.775415987 +0200
+@@ -316,6 +316,7 @@
+         self.assertDictEqual(expected_results[1], results[1])
+         del results
+ 
++    @pytest.mark.skip('')
+     def test_distributed_metrics(self):
+         with tempfile.TemporaryDirectory() as tmp_dir:
+             (preds_0, refs_0), (preds_1, refs_1) = DummyMetric.distributed_predictions_and_references()
+@@ -736,6 +736,7 @@
+ 
+         self.assertDictEqual(dummy_result_1, combined_evaluation.compute(predictions=preds, references=refs))
+ 
++    @pytest.mark.skip('require network')
+     def test_modules_from_string(self):
+         expected_result = {"accuracy": 0.5, "recall": 0.5, "precision": 1.0}
+         predictions = [0, 1]
+--- a/tests/test_metric_common.py	2023-05-20 15:57:02.399146066 +0200
++++ b/tests/test_metric_common.py	2023-05-20 15:59:25.167947472 +0200
+@@ -99,6 +99,7 @@
+     evaluation_module_name = None
+     evaluation_module_type = None
+ 
++    @pytest.mark.skip('require network')
+     def test_load(self, evaluation_module_name, evaluation_module_type):
+         doctest.ELLIPSIS_MARKER = "[...]"
+         evaluation_module = importlib.import_module(
+--- a/tests/test_trainer_evaluator_parity.py	2023-05-20 16:00:55.986549706 +0200
++++ b/tests/test_trainer_evaluator_parity.py	2023-05-20 16:02:51.808766855 +0200
+@@ -4,6 +4,7 @@
+ import subprocess
+ import tempfile
+ import unittest
++import pytest
+ 
+ import numpy as np
+ import torch
+@@ -33,6 +33,7 @@
+     def tearDown(self):
+         shutil.rmtree(self.dir_path, ignore_errors=True)
+ 
++    @pytest.mark.skip('require network')
+     def test_text_classification_parity(self):
+         model_name = "philschmid/tiny-bert-sst2-distilled"
+ 
+@@ -121,6 +122,7 @@
+ 
+         self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])
+ 
++    @pytest.mark.skip('require network')
+     def test_image_classification_parity(self):
+         # we can not compare to the Pytorch transformers example, that uses custom preprocessing on the images
+         model_name = "douwekiela/resnet-18-finetuned-dogfood"
+@@ -179,6 +181,7 @@
+ 
+         self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])
+ 
++    @pytest.mark.skip('require network')
+     def test_question_answering_parity(self):
+         model_name_v1 = "anas-awadalla/bert-tiny-finetuned-squad"
+         model_name_v2 = "mrm8488/bert-tiny-finetuned-squadv2"
diff --git a/sci-ml/evaluate/metadata.xml b/sci-ml/evaluate/metadata.xml
new file mode 100644
index 000000000000..f1e8571190f9
--- /dev/null
+++ b/sci-ml/evaluate/metadata.xml
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE pkgmetadata SYSTEM "https://www.gentoo.org/dtd/metadata.dtd">
+<pkgmetadata>
+	<maintainer type="person">
+		<email>tupone@gentoo.org</email>
+		<name>Tupone Alfredo</name>
+	</maintainer>
+	<upstream>
+		<remote-id type="pypi">evaluate</remote-id>
+		<remote-id type="github">huggingface/evaluate</remote-id>
+	</upstream>
+</pkgmetadata>