% Bibliography database: one record per publication, keyed by numeric record id.
% Conventions used in this file:
%   - one field per line, `abstract` last;
%   - author lists use single braces so that " and " separates individual names;
%   - titles keep an extra brace pair so styles do not recase acronyms;
%   - `doi` holds the bare DOI (no resolver URL); page ranges use `--`.
% Text outside of entries (such as these lines) is ignored by BibTeX.

@article{32854,
  author    = {Redder, Adrian and Ramaswamy, Arunselvan and Karl, Holger},
  title     = {{Practical Network Conditions for the Convergence of Distributed Optimization}},
  journal   = {IFAC-PapersOnLine},
  volume    = {55},
  number    = {13},
  pages     = {133--138},
  publisher = {Elsevier},
  year      = {2022},
}

@inproceedings{29220,
  author    = {Werner, Stefan and Schneider, Stefan Balthasar and Karl, Holger},
  title     = {{Use What You Know: Network and Service Coordination Beyond Certainty}},
  booktitle = {IEEE/IFIP Network Operations and Management Symposium (NOMS)},
  location  = {Budapest},
  publisher = {IEEE},
  year      = {2022},
  keywords  = {network management, service management, AI, Monte Carlo Tree Search, model-based, QoS},
  abstract  = {Modern services often comprise several components, such as chained virtual network functions, microservices, or machine learning functions. Providing such services requires to decide how often to instantiate each component, where to place these instances in the network, how to chain them and route traffic through them. To overcome limitations of conventional, hardwired heuristics, deep reinforcement learning (DRL) approaches for self-learning network and service management have emerged recently. These model-free DRL approaches are more flexible but typically learn tabula rasa, i.e., disregard existing understanding of networks, services, and their coordination. Instead, we propose FutureCoord, a novel model-based AI approach that leverages existing understanding of networks and services for more efficient and effective coordination without time-intensive training. FutureCoord combines Monte Carlo Tree Search with a stochastic traffic model. This allows FutureCoord to estimate the impact of future incoming traffic and effectively optimize long-term effects, taking fluctuating demand and Quality of Service (QoS) requirements into account.
Our extensive evaluation based on real-world network topologies, services, and traffic traces indicates that FutureCoord clearly outperforms state-of-the-art model-free and model-based approaches with up to 51% higher flow success ratios.},
}

@inproceedings{20125,
  author    = {Hasnain, Asif and Karl, Holger},
  title     = {{Learning Flow Scheduling}},
  booktitle = {2021 IEEE 18th Annual Consumer Communications \& Networking Conference (CCNC)},
  location  = {Las Vegas, USA},
  publisher = {IEEE Computer Society},
  year      = {2021},
  doi       = {10.1109/CCNC49032.2021.9369514},
  keywords  = {Flow scheduling, Deadlines, Reinforcement learning},
  abstract  = {Datacenter applications have different resource requirements from network and developing flow scheduling heuristics for every workload is practically infeasible. In this paper, we show that deep reinforcement learning (RL) can be used to efficiently learn flow scheduling policies for different workloads without manual feature engineering. Specifically, we present LFS, which learns to optimize a high-level performance objective, e.g., maximize the number of flow admissions while meeting the deadlines. The LFS scheduler is trained through deep RL to learn a scheduling policy on continuous online flow arrivals.
The evaluation results show that the trained LFS scheduler admits 1.05x more flows than the greedy flow scheduling heuristics under varying network load.},
}

% NOTE(review): the record below has no `school` field, which classic BibTeX
% styles require for theses -- TODO add the degree-granting institution.
@phdthesis{27503,
  author = {Hasnain, Asif},
  title  = {{Automating Network Resource Allocation for Coflows with Deadlines}},
  year   = {2021},
  doi    = {10.17619/UNIPB/1-1241},
}

@inproceedings{21005,
  author    = {Hasnain, Asif and Karl, Holger},
  title     = {{Learning Coflow Admissions}},
  booktitle = {IEEE INFOCOM 2021 - IEEE Conference on Computer Communications Workshops (INFOCOM WKSHPS)},
  location  = {Vancouver BC Canada},
  publisher = {IEEE Communications Society},
  year      = {2021},
  doi       = {10.1109/INFOCOMWKSHPS51825.2021.9484599},
  keywords  = {Coflow scheduling, Reinforcement learning, Deadlines},
  abstract  = {Data-parallel applications are developed using different data programming models, e.g., MapReduce, partition/aggregate. These models represent diverse resource requirements of application in a datacenter network, which can be represented by the coflow abstraction. The conventional method of creating hand-crafted coflow heuristics for admission or scheduling for different workloads is practically infeasible. In this paper, we propose a deep reinforcement learning (DRL)-based coflow admission scheme -- LCS -- that can learn an admission policy for a higher-level performance objective, i.e., maximize successful coflow admissions, without manual feature engineering. LCS is trained on a production trace, which has online coflow arrivals.
The evaluation results show that LCS is able to learn a reasonable admission policy that admits more coflows than state-of-the-art Varys heuristic while meeting their deadlines.},
}

@inproceedings{21543,
  author    = {Schneider, Stefan Balthasar and Qarawlus, Haydar and Karl, Holger},
  title     = {{Distributed Online Service Coordination Using Deep Reinforcement Learning}},
  booktitle = {IEEE International Conference on Distributed Computing Systems (ICDCS)},
  location  = {Washington, DC, USA},
  publisher = {IEEE},
  year      = {2021},
  keywords  = {network management, service management, coordination, reinforcement learning, distributed},
  abstract  = {Services often consist of multiple chained components such as microservices in a service mesh, or machine learning functions in a pipeline. Providing these services requires online coordination including scaling the service, placing instance of all components in the network, scheduling traffic to these instances, and routing traffic through the network. Optimized service coordination is still a hard problem due to many influencing factors such as rapidly arriving user demands and limited node and link capacity. Existing approaches to solve the problem are often built on rigid models and assumptions, tailored to specific scenarios. If the scenario changes and the assumptions no longer hold, they easily break and require manual adjustments by experts. Novel self-learning approaches using deep reinforcement learning (DRL) are promising but still have limitations as they only address simplified versions of the problem and are typically centralized and thus do not scale to practical large-scale networks. To address these issues, we propose a distributed self-learning service coordination approach using DRL. After centralized training, we deploy a distributed DRL agent at each node in the network, making fast coordination decisions locally in parallel with the other nodes. Each agent only observes its direct neighbors and does not need global knowledge.
Hence, our approach scales independently from the size of the network. In our extensive evaluation using real-world network topologies and traffic traces, we show that our proposed approach outperforms a state-of-the-art conventional heuristic as well as a centralized DRL approach (60% higher throughput on average) while requiring less time per online decision (1 ms).},
}

@inproceedings{20693,
  author    = {Schneider, Stefan Balthasar and Jürgens, Mirko and Karl, Holger},
  title     = {{Divide and Conquer: Hierarchical Network and Service Coordination}},
  booktitle = {IFIP/IEEE International Symposium on Integrated Network Management (IM)},
  location  = {Bordeaux, France},
  publisher = {IFIP/IEEE},
  year      = {2021},
  keywords  = {network management, service management, coordination, hierarchical, scalability, nfv},
  abstract  = {In practical, large-scale networks, services are requested by users across the globe, e.g., for video streaming. Services consist of multiple interconnected components such as microservices in a service mesh. Coordinating these services requires scaling them according to continuously changing user demand, deploying instances at the edge close to their users, and routing traffic efficiently between users and connected instances. Network and service coordination is commonly addressed through centralized approaches, where a single coordinator knows everything and coordinates the entire network globally. While such centralized approaches can reach global optima, they do not scale to large, realistic networks. In contrast, distributed approaches scale well, but sacrifice solution quality due to their limited scope of knowledge and coordination decisions. To this end, we propose a hierarchical coordination approach that combines the good solution quality of centralized approaches with the scalability of distributed approaches.
In doing so, we divide the network into multiple hierarchical domains and optimize coordination in a top-down manner. We compare our hierarchical with a centralized approach in an extensive evaluation on a real-world network topology. Our results indicate that hierarchical coordination can find close-to-optimal solutions in a fraction of the runtime of centralized approaches.},
}

@article{21808,
  author    = {Schneider, Stefan Balthasar and Khalili, Ramin and Manzoor, Adnan and Qarawlus, Haydar and Schellenberg, Rafael and Karl, Holger and Hecker, Artur},
  title     = {{Self-Learning Multi-Objective Service Coordination Using Deep Reinforcement Learning}},
  journal   = {Transactions on Network and Service Management},
  publisher = {IEEE},
  year      = {2021},
  doi       = {10.1109/TNSM.2021.3076503},
  keywords  = {network management, service management, coordination, reinforcement learning, self-learning, self-adaptation, multi-objective},
  abstract  = {Modern services consist of interconnected components,e.g., microservices in a service mesh or machine learning functions in a pipeline. These services can scale and run across multiple network nodes on demand. To process incoming traffic, service components have to be instantiated and traffic assigned to these instances, taking capacities, changing demands, and Quality of Service (QoS) requirements into account. This challenge is usually solved with custom approaches designed by experts. While this typically works well for the considered scenario, the models often rely on unrealistic assumptions or on knowledge that is not available in practice (e.g., a priori knowledge). We propose DeepCoord, a novel deep reinforcement learning approach that learns how to best coordinate services and is geared towards realistic assumptions. It interacts with the network and relies on available, possibly delayed monitoring information. Rather than defining a complex model or an algorithm on how to achieve an objective, our model-free approach adapts to various objectives and traffic patterns.
An agent is trained offline without expert knowledge and then applied online with minimal overhead. Compared to a state-of-the-art heuristic, DeepCoord significantly improves flow throughput (up to 76%) and overall network utility (more than 2x) on realworld network topologies and traffic traces. It also supports optimizing multiple, possibly competing objectives, learns to respect QoS requirements, generalizes to scenarios with unseen, stochastic traffic, and scales to large real-world networks. For reproducibility and reuse, our code is publicly available.},
}

% NOTE(review): the record below has no `institution` field, which classic
% BibTeX styles require for technical reports -- TODO add the issuing institution.
@techreport{33854,
  author   = {Schneider, Stefan Balthasar and Karl, Holger and Khalili, Ramin and Hecker, Artur},
  title    = {{DeepCoMP: Coordinated Multipoint Using Multi-Agent Deep Reinforcement Learning}},
  year     = {2021},
  keywords = {mobility management, coordinated multipoint, CoMP, cell selection, resource management, reinforcement learning, multi agent, MARL, self-learning, self-adaptation, QoE},
  abstract = {Macrodiversity is a key technique to increase the capacity of mobile networks. It can be realized using coordinated multipoint (CoMP), simultaneously connecting users to multiple overlapping cells. Selecting which users to serve by how many and which cells is NP-hard but needs to happen continuously in real time as users move and channel state changes. Existing approaches often require strict assumptions about or perfect knowledge of the underlying radio system, its resource allocation scheme, or user movements, none of which is readily available in practice. Instead, we propose three novel self-learning and self-adapting approaches using model-free deep reinforcement learning (DRL): DeepCoMP, DD-CoMP, and D3-CoMP. DeepCoMP leverages central observations and control of all users to select cells almost optimally.
DD-CoMP and D3-CoMP use multi-agent DRL, which allows distributed, robust, and highly scalable coordination. All three approaches learn from experience and self-adapt to varying scenarios, reaching 2x higher Quality of Experience than other approaches. They have very few built-in assumptions and do not need prior system knowledge, making them more robust to change and better applicable in practice than existing approaches.},
}

% NOTE(review): the record below has no `institution` field, which classic
% BibTeX styles require for technical reports -- TODO add the issuing institution.
@techreport{35889,
  author   = {Schneider, Stefan Balthasar},
  title    = {{Conventional and Machine Learning Approaches for Network and Service Coordination}},
  year     = {2021},
  keywords = {nfv, coordination, machine learning, reinforcement learning, phd, digest},
  abstract = {Network and service coordination is important to provide modern services consisting of multiple interconnected components, e.g., in 5G, network function virtualization (NFV), or cloud and edge computing. In this paper, I outline my dissertation research, which proposes six approaches to automate such network and service coordination. All approaches dynamically react to the current demand and optimize coordination for high service quality and low costs. The approaches range from centralized to distributed methods and from conventional heuristic algorithms and mixed-integer linear programs to machine learning approaches using supervised and reinforcement learning.
I briefly discuss their main ideas and advantages over other state-of-the-art approaches and compare strengths and weaknesses.},
}

@inproceedings{19607,
  author    = {Schneider, Stefan Balthasar and Klenner, Lars Dietrich and Karl, Holger},
  title     = {{Every Node for Itself: Fully Distributed Service Coordination}},
  booktitle = {IEEE International Conference on Network and Service Management (CNSM)},
  publisher = {IEEE},
  year      = {2020},
  keywords  = {distributed management, service coordination, network coordination, nfv, softwarization, orchestration},
  abstract  = {Modern services consist of modular, interconnected components, e.g., microservices forming a service mesh. To dynamically adjust to ever-changing service demands, service components have to be instantiated on nodes across the network. Incoming flows requesting a service then need to be routed through the deployed instances while considering node and link capacities. Ultimately, the goal is to maximize the successfully served flows and Quality of Service (QoS) through online service coordination. Current approaches for service coordination are usually centralized, assuming up-to-date global knowledge and making global decisions for all nodes in the network. Such global knowledge and centralized decisions are not realistic in practical large-scale networks. To solve this problem, we propose two algorithms for fully distributed service coordination. The proposed algorithms can be executed individually at each node in parallel and require only very limited global knowledge. We compare and evaluate both algorithms with a state-of-the-art centralized approach in extensive simulations on a large-scale, real-world network topology.
Our results indicate that the two algorithms can compete with centralized approaches in terms of solution quality but require less global knowledge and are magnitudes faster (more than 100x).},
}

@inproceedings{19609,
  author    = {Schneider, Stefan Balthasar and Manzoor, Adnan and Qarawlus, Haydar and Schellenberg, Rafael and Karl, Holger and Khalili, Ramin and Hecker, Artur},
  title     = {{Self-Driving Network and Service Coordination Using Deep Reinforcement Learning}},
  booktitle = {IEEE International Conference on Network and Service Management (CNSM)},
  publisher = {IEEE},
  year      = {2020},
  keywords  = {self-driving networks, self-learning, network coordination, service coordination, reinforcement learning, deep learning, nfv},
  abstract  = {Modern services comprise interconnected components, e.g., microservices in a service mesh, that can scale and run on multiple nodes across the network on demand. To process incoming traffic, service components have to be instantiated and traffic assigned to these instances, taking capacities and changing demands into account. This challenge is usually solved with custom approaches designed by experts. While this typically works well for the considered scenario, the models often rely on unrealistic assumptions or on knowledge that is not available in practice (e.g., a priori knowledge). We propose a novel deep reinforcement learning approach that learns how to best coordinate services and is geared towards realistic assumptions. It interacts with the network and relies on available, possibly delayed monitoring information. Rather than defining a complex model or an algorithm how to achieve an objective, our model-free approach adapts to various objectives and traffic patterns. An agent is trained offline without expert knowledge and then applied online with minimal overhead. Compared to a state-of-the-art heuristic, it significantly improves flow throughput and overall network utility on real-world network topologies and traffic traces.
It also learns to optimize different objectives, generalizes to scenarios with unseen, stochastic traffic patterns, and scales to large real-world networks.},
}

@inproceedings{17082,
  author    = {Hasnain, Asif and Karl, Holger},
  title     = {{Coflow Scheduling with Performance Guarantees for Data Center Applications}},
  booktitle = {2020 20th IEEE/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID)},
  location  = {Melbourne, Australia},
  publisher = {IEEE Computer Society},
  year      = {2020},
  doi       = {10.1109/CCGrid49817.2020.00010},
  keywords  = {Coflow, Scheduling, Deadlines, Data centers},
  abstract  = {Data-parallel applications run on cluster of servers in a datacenter and their communication triggers correlated resource demand on multiple links that can be abstracted as coflow. They often desire predictable network performance, which can be passed to network via coflow abstraction for application-aware network scheduling. In this paper, we propose a heuristic and an optimization algorithm for predictable network performance such that they guarantee coflows completion within their deadlines. The algorithms also ensure high network utilization, i.e., it's work-conserving, and avoids starvation of coflows.
We evaluate both algorithms via trace-driven simulation and show that they admit 1.1x more coflows than the Varys scheme while meeting their deadlines.},
}

@inproceedings{16219,
  author    = {Schneider, Stefan Balthasar and Satheeschandran, Narayanan Puthenpurayil and Peuster, Manuel and Karl, Holger},
  title     = {{Machine Learning for Dynamic Resource Allocation in Network Function Virtualization}},
  booktitle = {IEEE Conference on Network Softwarization (NetSoft)},
  location  = {Ghent, Belgium},
  publisher = {IEEE},
  year      = {2020},
  abstract  = {Network function virtualization (NFV) proposes to replace physical middleboxes with more flexible virtual network functions (VNFs). To dynamically adjust to everchanging traffic demands, VNFs have to be instantiated and their allocated resources have to be adjusted on demand. Deciding the amount of allocated resources is non-trivial. Existing optimization approaches often assume fixed resource requirements for each VNF instance. However, this can easily lead to either waste of resources or bad service quality if too many or too few resources are allocated. To solve this problem, we train machine learning models on real VNF data, containing measurements of performance and resource requirements. For each VNF, the trained models can then accurately predict the required resources to handle a certain traffic load. We integrate these machine learning models into an algorithm for joint VNF scaling and placement and evaluate their impact on resulting VNF placements.
Our evaluation based on real-world data shows that using suitable machine learning models effectively avoids over- and underallocation of resources, leading to up to 12 times lower resource consumption and better service quality with up to 4.5 times lower total delay than using standard fixed resource allocation.},
}

@inproceedings{16222,
  author    = {Zafeiropoulos, A. and Fotopoulou, E. and Peuster, Manuel and Schneider, Stefan Balthasar and Gouvas, P. and Behnke, D. and Müller, M. and Bök, P. and Trakadas, P. and Karkazis, P. and Karl, Holger},
  title     = {{Benchmarking and Profiling 5G Verticals' Applications: An Industrial IoT Use Case}},
  booktitle = {IEEE Conference on Network Softwarization (NetSoft)},
  year      = {2020},
}

@article{16278,
  author   = {Nomikos, Nikolaos and Michailidis, Emmanouel T. and Trakadas, Panagiotis and Vouyioukas, Demosthenes and Karl, Holger and Martrat, Josep and Zahariadis, Theodore and Papadopoulos, Konstantinos and Voliotis, Stamatis},
  title    = {{A UAV-based moving 5G RAN for massive connectivity of mobile users and IoT devices}},
  journal  = {Vehicular Communications},
  year     = {2020},
  issn     = {2214-2096},
  doi      = {10.1016/j.vehcom.2020.100250},
  abstract = {Currently, the coexistence of multiple users and devices challenges the network's ability to reliably connect them. This article proposes a novel communication architecture that satisfies the requirements of fifth-generation (5G) mobile network applications. In particular, this architecture extends and combines ultra-dense networking (UDN), multi-access edge computing (MEC), and virtual infrastructure manager (VIM) concepts to provide a flexible network of moving radio access (RA) nodes, flying or moving to areas where users and devices struggle for connectivity and data rate.
Furthermore, advances in radio communications and non-orthogonal multiple access (NOMA), virtualization technologies and energy-awareness mechanisms are integrated towards a mobile UDN that not only allows RA nodes to follow the user but also enables the virtualized network functions (VNFs) to adapt to user mobility by migrating from one node to another. Performance evaluation shows that the underlying network improves connectivity of users and devices through the flexible deployment of moving RA nodes and the use of NOMA.},
}

@article{16280,
  author   = {Li, Feng and Yu, Dongxiao and Yang, Huan and Yu, Jiguo and Karl, Holger and Cheng, Xiuzhen},
  title    = {{Multi-Armed-Bandit-Based Spectrum Scheduling Algorithms in Wireless Networks: A Survey}},
  journal  = {IEEE Wireless Communications},
  pages    = {24--30},
  year     = {2020},
  issn     = {1536-1284},
  doi      = {10.1109/mwc.001.1900280},
  abstract = {Assigning bands of the wireless spectrum as resources to users is a common problem in wireless networks. Typically, frequency bands were assumed to be available in a stable manner. Nevertheless, in recent scenarios where wireless networks may be deployed in unknown environments, spectrum competition is considered, making it uncertain whether a frequency band is available at all or at what quality. To fully exploit such resources with uncertain availability, the multi-armed bandit (MAB) method, a representative online learning technique, has been applied to design spectrum scheduling algorithms. This article surveys such proposals. We describe the following three aspects: how to model spectrum scheduling problems within the MAB framework, what the main thread is following which prevalent algorithms are designed, and how to evaluate algorithm performance and complexity.
We also give some promising directions for future research in related fields.},
}

@inproceedings{16400,
  author    = {Müller, Marcel and Behnke, Daniel and Bök, Patrick-Benjamin and Schneider, Stefan Balthasar and Peuster, Manuel and Karl, Holger},
  title     = {{Cloud-Native Threat Detection and Containment for Smart Manufacturing}},
  booktitle = {IEEE Conference on Network Softwarization (NetSoft) Demo Track},
  location  = {Ghent, Belgium},
  publisher = {IEEE},
  year      = {2020},
  abstract  = {Softwarization facilitates the introduction of smart manufacturing applications in the industry. Manifold devices such as machine computers, Industrial IoT devices, tablets, smartphones and smart glasses are integrated into factory networks to enable shop floor digitalization and big data analysis. To handle the increasing number of devices and the resulting traffic, a flexible and scalable factory network is necessary which can be realized using softwarization technologies like Network Function Virtualization (NFV). However, the security risks increase with the increasing number of new devices, so that cyber security must also be considered in NFV-based networks. Therefore, extending our previous work, we showcase threat detection using a cloud-native NFV-driven intrusion detection system (IDS) that is integrated in our industrial-specific network services. As a result of the threat detection, the affected network service is put into quarantine via automatic network reconfiguration.
We use the 5GTANGO service platform to deploy our developed network services on Kubernetes and to initiate the network reconfiguration.},
}

@article{13770,
  author    = {Karl, Holger and Kundisch, Dennis and {Meyer auf der Heide}, Friedhelm and Wehrheim, Heike},
  title     = {{A Case for a New IT Ecosystem: On-The-Fly Computing}},
  journal   = {Business \& Information Systems Engineering},
  volume    = {62},
  number    = {6},
  pages     = {467--481},
  publisher = {Springer},
  year      = {2020},
  doi       = {10.1007/s12599-019-00627-x},
}

@inproceedings{3287,
  author    = {Schneider, Stefan Balthasar and Sharma, Arnab and Karl, Holger and Wehrheim, Heike},
  title     = {{Specifying and Analyzing Virtual Network Services Using Queuing Petri Nets}},
  booktitle = {2019 IFIP/IEEE International Symposium on Integrated Network Management (IM)},
  location  = {Washington, DC, USA},
  pages     = {116--124},
  publisher = {IFIP},
  year      = {2019},
  abstract  = {For optimal placement and orchestration of network services, it is crucial that their structure and semantics are specified clearly and comprehensively and are available to an orchestrator. Existing specification approaches are either ambiguous or miss important aspects regarding the behavior of virtual network functions (VNFs) forming a service. We propose to formally and unambiguously specify the behavior of these functions and services using Queuing Petri Nets (QPNs). QPNs are an established method that allows to express queuing, synchronization, stochastically distributed processing delays, and changing traffic volume and characteristics at each VNF. With QPNs, multiple VNFs can be connected to complete network services in any structure, even specifying bidirectional network services containing loops. We discuss how management and orchestration systems can benefit from our clear and comprehensive specification approach, leading to better placement of VNFs and improved Quality of Service.
Another benefit of formally specifying network services with QPNs are diverse analysis options, which allow valuable insights such as the distribution of end-to-end delay. We propose a tool-based workflow that supports the specification of network services and the automatic generation of corresponding simulation code to enable an in-depth analysis of their behavior and performance.},
}