@article{54548,
  author       = {Prager, Raphael Patrick and Trautmann, Heike},
  title        = {Exploratory Landscape Analysis for Mixed-Variable Problems},
  journal      = {{IEEE} Transactions on Evolutionary Computation},
  keywords     = {Optimization, Evolutionary computation, Benchmark testing, Hyperparameter optimization, Portfolios, Extraterrestrial measurements, Dispersion, Exploratory landscape analysis, mixed-variable problem, mixed search spaces, automated algorithm selection},
  pages        = {1--1},
  doi          = {10.1109/TEVC.2024.3399560},
  year         = {2024},
}

@article{56221,
  author       = {Rodriguez-Fernandez, Angel E. and Sch{\"a}permeier, Lennart and Hern{\'a}ndez, Carlos and Kerschke, Pascal and Trautmann, Heike and Sch{\"u}tze, Oliver},
  title        = {Finding {$\epsilon$}-Locally Optimal Solutions for Multi-Objective Multimodal Optimization},
  journal      = {{IEEE} Transactions on Evolutionary Computation},
  keywords     = {Optimization, Evolutionary computation, Approximation algorithms, Benchmark testing, Vectors, Surveys, Pareto optimization, multi-objective optimization, evolutionary computation, multimodal optimization, local solutions},
  pages        = {1--1},
  doi          = {10.1109/TEVC.2024.3458855},
  year         = {2024},
}

@incollection{56581,
  abstract     = {In recent years, there has been a surge in natural language processing research focused on low-resource languages (LrLs), underscoring the growing recognition that LrLs deserve the same attention as high-resource languages (HrLs). This shift is crucial for ensuring linguistic diversity and inclusivity in the digital age. Despite Indonesian ranking as the 11th most spoken language globally, it remains under-resourced in terms of computational tools and datasets. Within the semantic web domain, Entity Linking (EL) is pivotal, linking textual entity mentions to their corresponding entries in knowledge bases. This process is foundational for advanced information extraction tasks, including relation extraction and event detection. To bolster EL research in Indonesian, we introduce IndEL, the first benchmark dataset tailored for both general and specific domains. IndEL was manually curated using Wikidata, adhering to a rigorous set of annotation guidelines. We used two Named Entity Recognition (NER) benchmark datasets for entity extraction: NER UI for the general domain and IndQNER for the specific domain. IndQNER focused on entities from the Indonesian translation of the Quran. IndEL comprises 4765 entities in the general domain and 2453 in the specific domain. Using the GERBIL framework, we use IndEL to evaluate the performance of various EL systems, such as Babelfy, DBpedia Spotlight, MAG, OpenTapioca, and WAT. Our further investigation reveals that within Wikidata, a significant number of NIL entities remain unlinked due to the limited number of Indonesian labels and the use of acronyms. Especially in the specific domain, transliteration and translation processes performed to create the Indonesian translation of the Quran contribute to the presence of entities in a descriptive form and as synonyms.},
  author       = {Gusmita, Ria Hari and Abshar, Muhammad Faruq Amiral and Moussallem, Diego and Ngonga Ngomo, Axel-Cyrille},
  title        = {{IndEL}: {Indonesian} Entity Linking Benchmark Dataset for General and Specific Domains},
  booktitle    = {Lecture Notes in Computer Science},
  publisher    = {Springer Nature Switzerland},
  location     = {Turin, Italy},
  isbn         = {9783031702389},
  issn         = {0302-9743},
  keywords     = {entity linking benchmark dataset, Indonesian, general and specific domains},
  doi          = {10.1007/978-3-031-70239-6_34},
  year         = {2024},
}

@incollection{46572,
  abstract     = {Indonesian is classified as underrepresented in the Natural Language Processing (NLP) field, despite being the tenth most spoken language in the world with 198 million speakers. The paucity of datasets is recognized as the main reason for the slow advancements in NLP research for underrepresented languages. Significant attempts were made in 2020 to address this drawback for Indonesian. The Indonesian Natural Language Understanding (IndoNLU) benchmark was introduced alongside IndoBERT pre-trained language model. The second benchmark, Indonesian Language Evaluation Montage (IndoLEM), was presented in the same year. These benchmarks support several tasks, including Named Entity Recognition (NER). However, all NER datasets are in the public domain and do not contain domain-specific datasets. To alleviate this drawback, we introduce IndQNER, a manually annotated NER benchmark dataset in the religious domain that adheres to a meticulously designed annotation guideline. Since Indonesia has the world's largest Muslim population, we build the dataset from the Indonesian translation of the Quran. The dataset includes 2475 named entities representing 18 different classes. To assess the annotation quality of IndQNER, we perform experiments with BiLSTM and CRF-based NER, as well as IndoBERT fine-tuning. The results reveal that the first model outperforms the second model achieving 0.98 F1 points. This outcome indicates that IndQNER may be an acceptable evaluation metric for Indonesian NER tasks in the aforementioned domain, widening the research's domain range.},
  author       = {Gusmita, Ria Hari and Firmansyah, Asep Fajar and Moussallem, Diego and Ngonga Ngomo, Axel-Cyrille},
  title        = {{IndQNER}: Named Entity Recognition Benchmark Dataset from the {Indonesian} Translation of the {Quran}},
  booktitle    = {Natural Language Processing and Information Systems},
  publisher    = {Springer Nature Switzerland},
  location     = {Derby, UK},
  isbn         = {9783031353192},
  issn         = {0302-9743},
  keywords     = {NER benchmark dataset, Indonesian, specific domain},
  doi          = {10.1007/978-3-031-35320-8_12},
  year         = {2023},
}

@misc{32409,
  abstract     = {Context: Cryptographic APIs are often misused in real-world applications. Therefore, many cryptographic API misuse detection tools have been introduced. However, there exists no established reference benchmark for a fair and comprehensive comparison and evaluation of these tools. While there are benchmarks, they often only address a subset of the domain or were only used to evaluate a subset of existing misuse detection tools. Objective: To fairly compare cryptographic API misuse detection tools and to drive future development in this domain, we will devise such a benchmark. Openness and transparency in the generation process are key factors to fairly generate and establish the needed benchmark. Method: We propose an approach where we derive the benchmark generation methodology from the literature which consists of general best practices in benchmarking and domain-specific benchmark generation. A part of this methodology is transparency and openness of the generation process, which is achieved by pre-registering this work. Based on our methodology we design CamBench, a fair "Cryptographic API Misuse Detection Tool Benchmark Suite". We will implement the first version of CamBench limiting the domain to Java, the JCA, and static analyses. Finally, we will use CamBench to compare current misuse detection tools and compare CamBench to related benchmarks of its domain.},
  author       = {Schlichtig, Michael and Wickert, Anna-Katharina and Kr{\"u}ger, Stefan and Bodden, Eric and Mezini, Mira},
  title        = {{CamBench} -- Cryptographic {API} Misuse Detection Tool Benchmark Suite},
  keywords     = {cryptography, benchmark, API misuse, static analysis},
  eprint       = {2204.06447},
  archiveprefix = {arXiv},
  doi          = {10.48550/ARXIV.2204.06447},
  year         = {2022},
}

@inproceedings{15838,
  abstract     = {In the field of software analysis a trade-off between scalability and accuracy always exists. In this respect, Android app analysis is no exception, in particular, analyzing large or many apps can be challenging. Dealing with many small apps is a typical challenge when facing micro-benchmarks such as DROIDBENCH or ICC-BENCH. These particular benchmarks are not only used for the evaluation of novel tools but also in continuous integration pipelines of existing mature tools to maintain and guarantee a certain quality-level. Considering this latter usage it becomes very important to be able to achieve benchmark results as fast as possible. Hence, benchmarks have to be optimized for this purpose. One approach to do so is app merging. We implemented the Android Merge Tool (AMT) following this approach and show that its novel aspects can be used to produce scaled up and accurate benchmarks. For such benchmarks Android app analysis tools do not suffer from the scalability-accuracy trade-off anymore. We show this throughout detailed experiments on DROIDBENCH employing three different analysis tools (AMANDROID, ICCTA, FLOWDROID). Benchmark execution times are largely reduced without losing benchmark accuracy. Moreover, we argue why AMT is an advantageous successor of the state-of-the-art app merging tool (APKCOMBINER) in analysis lift-up scenarios.},
  author       = {Pauck, Felix and Zhang, Shikun},
  title        = {{Android} App Merging for Benchmark Speed-Up and Analysis Lift-Up},
  booktitle    = {2019 34th {IEEE/ACM} International Conference on Automated Software Engineering Workshop ({ASEW})},
  isbn         = {9781728141367},
  keywords     = {Program Analysis, Android App Analysis, Taint Analysis, App Merging, Benchmark},
  doi          = {10.1109/asew.2019.00019},
  year         = {2019},
}

@incollection{56579,
  abstract     = {Question answering engines have become one of the most popular type of applications driven by Semantic Web technologies. Consequently, the provision of means to quantify the performance of current question answering approaches on current datasets has become ever more important. However, a large percentage of the queries found in popular question answering benchmarks cannot be executed on current versions of their reference dataset. There is consequently a clear need to curate question answering benchmarks periodically. However, the manual alteration of question answering benchmarks is often error-prone. We alleviate this problem by presenting QUANT, a novel framework for the creation and curation of question answering benchmarks. QUANT supports the curation of benchmarks by generating smart edit suggestions for question-query pairs and for the corresponding metadata. In addition, our framework supports the creation of new benchmark entries by providing predefined quality checks for queries. We evaluate QUANT on 653 questions obtained from QALD-1 to QALD-8 with 10 users. Our results show that our framework generates reliable suggestions and can reduce the curation effort for QA benchmarks by up to 91%.},
  author       = {Gusmita, Ria Hari and Jalota, Rricha and Vollmers, Daniel and Reineke, Jan and Ngonga Ngomo, Axel-Cyrille and Usbeck, Ricardo},
  editor       = {Acosta, Maribel and Cudr{\'e}-Mauroux, Philippe and Maleshkova, Maria and Pellegrini, Tassilo and Sack, Harald and Sure-Vetter, York},
  title        = {{QUANT} - Question Answering Benchmark Curator},
  booktitle    = {Semantic Systems. The Power of {AI} and Knowledge Graphs},
  publisher    = {Springer International Publishing},
  location     = {Karlsruhe, Germany},
  pages        = {343--358},
  isbn         = {978-3-030-33219-8},
  issn         = {0302-9743},
  keywords     = {Benchmark, Question answering, Knowledge base},
  doi          = {10.1007/978-3-030-33220-4_25},
  year         = {2019},
}

@inproceedings{2432,
  abstract     = {In this paper, we present the analysis of applications from the domain of handheld and wearable computing. This analysis is the first step to derive and evaluate design parameters for dynamically reconfigurable processors. We discuss the selection of representative benchmarks for handhelds and wearables and group the applications into multimedia, communications, and cryptography programs. We simulate the applications on a cycle-accurate processor simulator and gather statistical data such as instruction mix, cache hit rates and memory requirements for an embedded processor model. A breakdown of the executed cycles into different functions identifies the most compute-intensive code sections - the kernels. Then, we analyze the applications and discuss parameters that strongly influence the design of dynamically reconfigurable processors. Finally, we outline the construction of a parameterizable simulation model for a reconfigurable unit that is attached to a processor core.},
  author       = {Enzler, Rolf and Platzner, Marco and Plessl, Christian and Thiele, Lothar and Tr{\"o}ster, Gerhard},
  title        = {Reconfigurable Processors for Handhelds and Wearables: Application Analysis},
  booktitle    = {Reconfigurable Technology: {FPGAs} and Reconfigurable Processors for Computing and Communications {III}},
  keywords     = {benchmark},
  pages        = {135--146},
  volume       = {4525},
  doi          = {10.1117/12.434376},
  year         = {2001},
}

