@article{59074,
  author       = {{Hu, Jiahai and Wang, Lin and Wu, Jing and Pei, Qiangyu and Liu, Fangming and Li, Bo}},
  issn         = {{1389-1286}},
  journal      = {{Computer Networks}},
  publisher    = {{Elsevier BV}},
  title        = {{{A Comparative Measurement Study of Cross-Layer 5G Performance Under Different Mobility Scenarios}}},
  doi          = {{10.1016/j.comnet.2024.110952}},
  volume       = {{257}},
  year         = {{2024}},
}

@article{63059,
  abstract     = {{While high accuracy is of paramount importance for deep learning (DL) inference, serving inference requests on time is equally critical but has not been carefully studied especially when the request has to be served over a dynamic wireless network at the edge. In this paper, we propose Jellyfish—a novel edge DL inference serving system that achieves soft guarantees for end-to-end inference latency service-level objectives (SLO). Jellyfish handles the network variability by utilizing both data and deep neural network (DNN) adaptation to conduct tradeoffs between accuracy and latency. Jellyfish features a new design that enables collective adaptation policies where the decisions for data and DNN adaptations are aligned and coordinated among multiple users with varying network conditions. We propose efficient algorithms to continuously map users and adapt DNNs at runtime, so that we fulfill latency SLOs while maximizing the overall inference accuracy. We further investigate \emph{dynamic} DNNs, i.e., DNNs that encompass multiple architecture variants, and demonstrate their potential benefit through preliminary experiments. Our experiments based on a prototype implementation and real-world WiFi and LTE network traces show that Jellyfish can meet latency SLOs at around the 99th percentile while maintaining high accuracy.}},
  author       = {{Nigade, Vinod and Bauszat, Pablo and Bal, Henri and Wang, Lin}},
  issn         = {{0922-6443}},
  journal      = {{Real-Time Systems}},
  number       = {{2}},
  pages        = {{239--290}},
  publisher    = {{Springer Science and Business Media LLC}},
  title        = {{{Inference serving with end-to-end latency SLOs over dynamic edge networks}}},
  doi          = {{10.1007/s11241-024-09418-4}},
  volume       = {{60}},
  year         = {{2024}},
}

@article{63060,
  author       = {{Wu, Jing and Wang, Lin and Jin, Qirui and Liu, Fangming}},
  issn         = {{1045-9219}},
  journal      = {{IEEE Transactions on Parallel and Distributed Systems}},
  number       = {{2}},
  pages        = {{280--296}},
  publisher    = {{Institute of Electrical and Electronics Engineers (IEEE)}},
  title        = {{{Graft: Efficient Inference Serving for Hybrid Deep Learning With SLO Guarantees via DNN Re-Alignment}}},
  doi          = {{10.1109/tpds.2023.3340518}},
  volume       = {{35}},
  year         = {{2023}},
}

@phdthesis{29672,
  author       = {{Schneider, Stefan Balthasar}},
  title        = {{{Network and Service Coordination: Conventional and Machine Learning Approaches}}},
  doi          = {{10.17619/UNIPB/1-1276}},
  year         = {{2022}},
}

@inproceedings{30236,
  abstract     = {{Recent reinforcement learning approaches for continuous control in wireless mobile networks have shown impressive
results. But due to the lack of open and compatible simulators, authors typically create their own simulation environments for training and evaluation. This is cumbersome and time-consuming for authors and limits reproducibility and comparability, ultimately impeding progress in the field.

To this end, we propose mobile-env, a simple and open platform for training, evaluating, and comparing reinforcement learning and conventional approaches for continuous control in mobile wireless networks. mobile-env is lightweight and implements the common OpenAI Gym interface and additional wrappers, which allows connecting virtually any single-agent or multi-agent reinforcement learning framework to the environment. While mobile-env provides sensible default values and can be used out of the box, it also has many configuration options and is easy to extend. We therefore believe mobile-env to be a valuable platform for driving meaningful progress in autonomous coordination of
wireless mobile networks.}},
  author       = {{Schneider, Stefan Balthasar and Werner, Stefan and Khalili, Ramin and Hecker, Artur and Karl, Holger}},
  booktitle    = {{IEEE/IFIP Network Operations and Management Symposium (NOMS)}},
  keywords     = {{wireless mobile networks, network management, continuous control, cognitive networks, autonomous coordination, reinforcement learning, gym environment, simulation, open source}},
  location     = {{Budapest}},
  publisher    = {{IEEE}},
  title        = {{{mobile-env: An Open Platform for Reinforcement Learning in Wireless Mobile Networks}}},
  year         = {{2022}},
}

@inproceedings{32811,
  abstract     = {{The decentralized nature of multi-agent systems requires continuous data exchange to achieve global objectives. In such scenarios, Age of Information (AoI) has become an important metric of the freshness of exchanged data due to the error-proneness and delays of communication systems. Communication systems usually possess dependencies: the process describing the success or failure of communication is highly correlated when these attempts are ``close'' in some domain (e.g. in time, frequency, space or code as in wireless communication) and is, in general, non-stationary. To study AoI in such scenarios, we consider an abstract event-based AoI process $\Delta(n)$, expressing time since the last update: If, at time $n$, a monitoring node receives a status update from a source node (event $A(n-1)$ occurs), then $\Delta(n)$ is reset to one; otherwise, $\Delta(n)$ grows linearly in time. This AoI process can thus be viewed as a special random walk with resets. The event process $A(n)$ may be nonstationary and we merely assume that its temporal dependencies decay sufficiently, described by $\alpha$-mixing. We calculate moment bounds for the resulting AoI process as a function of the mixing rate of $A(n)$. Furthermore, we prove that the AoI process $\Delta(n)$ is itself $\alpha$-mixing from which we conclude a strong law of large numbers for $\Delta(n)$. These results are new, since AoI processes have not been studied so far in this general strongly mixing setting. This opens up future work on renewal processes with non-independent interarrival times.}},
  author       = {{Redder, Adrian and Ramaswamy, Arunselvan and Karl, Holger}},
  booktitle    = {{Proceedings of the 58th Allerton Conference on Communication, Control, and Computing}},
  title        = {{{Age of Information Process under Strongly Mixing Communication -- Moment Bound, Mixing Rate and Strong Law}}},
  year         = {{2022}},
}

@inproceedings{30793,
  author       = {{Redder, Adrian and Ramaswamy, Arunselvan and Karl, Holger}},
  booktitle    = {{Proceedings of the 14th International Conference on Agents and Artificial Intelligence}},
  publisher    = {{SCITEPRESS - Science and Technology Publications}},
  title        = {{{Multi-agent Policy Gradient Algorithms for Cyber-physical Systems with Lossy Communication}}},
  doi          = {{10.5220/0010845400003116}},
  year         = {{2022}},
}

@unpublished{30790,
  abstract     = {{Iterative distributed optimization algorithms involve multiple agents that
communicate with each other, over time, in order to minimize/maximize a global
objective. In the presence of unreliable communication networks, the
Age-of-Information (AoI), which measures the freshness of data received, may be
large and hence hinder algorithmic convergence. In this paper, we study the
convergence of general distributed gradient-based optimization algorithms in
the presence of communication that neither happens periodically nor at
stochastically independent points in time. We show that convergence is
guaranteed provided the random variables associated with the AoI processes are
stochastically dominated by a random variable with finite first moment. This
improves on previous requirements of boundedness of more than the first moment.
We then introduce stochastically strongly connected (SSC) networks, a new
stochastic form of strong connectedness for time-varying networks. We show: If
for any $p \ge0$ the processes that describe the success of communication
between agents in a SSC network are $\alpha$-mixing with $n^{p-1}\alpha(n)$
summable, then the associated AoI processes are stochastically dominated by a
random variable with finite $p$-th moment. In combination with our first
contribution, this implies that distributed stochastic gradient descend
converges in the presence of AoI, if $\alpha(n)$ is summable.}},
  author       = {{Redder, Adrian and Ramaswamy, Arunselvan and Karl, Holger}},
  note         = {{arXiv:2201.11343}},
  title        = {{{Distributed gradient-based optimization in the presence of dependent aperiodic communication}}},
  year         = {{2022}},
}

@unpublished{30791,
  abstract     = {{We present sufficient conditions that ensure convergence of the multi-agent
Deep Deterministic Policy Gradient (DDPG) algorithm. It is an example of one of
the most popular paradigms of Deep Reinforcement Learning (DeepRL) for tackling
continuous action spaces: the actor-critic paradigm. In the setting considered
herein, each agent observes a part of the global state space in order to take
local actions, for which it receives local rewards. For every agent, DDPG
trains a local actor (policy) and a local critic (Q-function). The analysis
shows that multi-agent DDPG using neural networks to approximate the local
policies and critics converge to limits with the following properties: The
critic limits minimize the average squared Bellman loss; the actor limits
parameterize a policy that maximizes the local critic's approximation of
$Q_i^*$, where $i$ is the agent index. The averaging is with respect to a
probability distribution over the global state-action space. It captures the
asymptotics of all local training processes. Finally, we extend the analysis to
a fully decentralized setting where agents communicate over a wireless network
prone to delays and losses; a typical scenario in, e.g., robotic applications.}},
  author       = {{Redder, Adrian and Ramaswamy, Arunselvan and Karl, Holger}},
  note         = {{arXiv:2201.00570}},
  title        = {{{Asymptotic Convergence of Deep Multi-Agent Actor-Critic Algorithms}}},
  year         = {{2022}},
}

@article{32854,
  author       = {{Redder, Adrian and Ramaswamy, Arunselvan and Karl, Holger}},
  journal      = {{IFAC-PapersOnLine}},
  number       = {{13}},
  pages        = {{133--138}},
  publisher    = {{Elsevier}},
  title        = {{{Practical Network Conditions for the Convergence of Distributed Optimization}}},
  volume       = {{55}},
  year         = {{2022}},
}

@inproceedings{29220,
  abstract     = {{Modern services often comprise several components, such as chained virtual network functions, microservices, or
machine learning functions. Providing such services requires to decide how often to instantiate each component, where to place these instances in the network, how to chain them and route traffic through them. 
To overcome limitations of conventional, hardwired heuristics, deep reinforcement learning (DRL) approaches for self-learning network and service management have emerged recently. These model-free DRL approaches are more flexible but typically learn tabula rasa, i.e., disregard existing understanding of networks, services, and their coordination. 

Instead, we propose FutureCoord, a novel model-based AI approach that leverages existing understanding of networks and services for more efficient and effective coordination without time-intensive training. FutureCoord combines Monte Carlo Tree Search with a stochastic traffic model. This allows FutureCoord to estimate the impact of future incoming traffic and effectively optimize long-term effects, taking fluctuating demand and Quality of Service (QoS) requirements into account. Our extensive evaluation based on real-world network topologies, services, and traffic traces indicates that FutureCoord clearly outperforms state-of-the-art model-free and model-based approaches with up to 51% higher flow success ratios.}},
  author       = {{Werner, Stefan and Schneider, Stefan Balthasar and Karl, Holger}},
  booktitle    = {{IEEE/IFIP Network Operations and Management Symposium (NOMS)}},
  keywords     = {{network management, service management, AI, Monte Carlo Tree Search, model-based, QoS}},
  location     = {{Budapest}},
  publisher    = {{IEEE}},
  title        = {{{Use What You Know: Network and Service Coordination Beyond Certainty}}},
  year         = {{2022}},
}

@inproceedings{20125,
  abstract     = {{Datacenter applications have different resource requirements from network and developing flow scheduling heuristics for every workload is practically infeasible. In this paper, we show that deep reinforcement learning (RL) can be used to efficiently learn flow scheduling policies for different workloads without manual feature engineering. Specifically, we present LFS, which learns to optimize a high-level performance objective, e.g., maximize the number of flow admissions while meeting the deadlines. The LFS scheduler is trained through deep RL to learn a scheduling policy on continuous online flow arrivals. The evaluation results show that the trained LFS scheduler admits 1.05x more flows than the greedy flow scheduling heuristics under varying network load.}},
  author       = {{Hasnain, Asif and Karl, Holger}},
  booktitle    = {{2021 IEEE 18th Annual Consumer Communications & Networking Conference (CCNC)}},
  keywords     = {{Flow scheduling, Deadlines, Reinforcement learning}},
  location     = {{Las Vegas, USA}},
  publisher    = {{IEEE Computer Society}},
  title        = {{{Learning Flow Scheduling}}},
  doi          = {{10.1109/CCNC49032.2021.9369514}},
  year         = {{2021}},
}

@phdthesis{27503,
  author       = {{Hasnain, Asif}},
  title        = {{{Automating Network Resource Allocation for Coflows with Deadlines}}},
  doi          = {{10.17619/UNIPB/1-1241}},
  year         = {{2021}},
}

@inproceedings{21005,
  abstract     = {{Data-parallel applications are developed using different data programming models, e.g., MapReduce, partition/aggregate. These models represent diverse resource requirements of application in a datacenter network, which can be represented by the coflow abstraction. The conventional method of creating hand-crafted coflow heuristics for admission or scheduling for different workloads is practically infeasible. In this paper, we propose a deep reinforcement learning (DRL)-based coflow admission scheme -- LCS -- that can learn an admission policy for a higher-level performance objective, i.e., maximize successful coflow admissions, without manual feature engineering.  LCS is trained on a production trace, which has online coflow arrivals. The evaluation results show that LCS is able to learn a reasonable admission policy that admits more coflows than state-of-the-art Varys heuristic while meeting their deadlines.}},
  author       = {{Hasnain, Asif and Karl, Holger}},
  booktitle    = {{IEEE INFOCOM 2021 - IEEE Conference on Computer Communications Workshops (INFOCOM WKSHPS)}},
  keywords     = {{Coflow scheduling, Reinforcement learning, Deadlines}},
  location     = {{Vancouver BC Canada}},
  publisher    = {{IEEE Communications Society}},
  title        = {{{Learning Coflow Admissions}}},
  doi          = {{10.1109/INFOCOMWKSHPS51825.2021.9484599}},
  year         = {{2021}},
}

@inproceedings{21543,
  abstract     = {{Services often consist of multiple chained components such as microservices in a service mesh, or machine learning functions in a pipeline. Providing these services requires online coordination including scaling the service, placing instance of all components in the network, scheduling traffic to these instances, and routing traffic through the network. Optimized service coordination is still a hard problem due to many influencing factors such as rapidly arriving user demands and limited node and link capacity. Existing approaches to solve the problem are often built on rigid models and assumptions, tailored to specific scenarios. If the scenario changes and the assumptions no longer hold, they easily break and require manual adjustments by experts. Novel self-learning approaches using deep reinforcement learning (DRL) are promising but still have limitations as they only address simplified versions of the problem and are typically centralized and thus do not scale to practical large-scale networks.

To address these issues, we propose a distributed self-learning service coordination approach using DRL. After centralized training, we deploy a distributed DRL agent at each node in the network, making fast coordination decisions locally in parallel with the other nodes. Each agent only observes its direct neighbors and does not need global knowledge. Hence, our approach scales independently from the size of the network. In our extensive evaluation using real-world network topologies and traffic traces, we show that our proposed approach outperforms a state-of-the-art conventional heuristic as well as a centralized DRL approach (60% higher throughput on average) while requiring less time per online decision (1 ms).}},
  author       = {{Schneider, Stefan Balthasar and Qarawlus, Haydar and Karl, Holger}},
  booktitle    = {{IEEE International Conference on Distributed Computing Systems (ICDCS)}},
  keywords     = {{network management, service management, coordination, reinforcement learning, distributed}},
  location     = {{Washington, DC, USA}},
  publisher    = {{IEEE}},
  title        = {{{Distributed Online Service Coordination Using Deep Reinforcement Learning}}},
  year         = {{2021}},
}

@inproceedings{20693,
  abstract     = {{In practical, large-scale networks, services are requested
by users across the globe, e.g., for video streaming.
Services consist of multiple interconnected components such as
microservices in a service mesh. Coordinating these services
requires scaling them according to continuously changing user
demand, deploying instances at the edge close to their users,
and routing traffic efficiently between users and connected instances.
Network and service coordination is commonly addressed
through centralized approaches, where a single coordinator
knows everything and coordinates the entire network globally.
While such centralized approaches can reach global optima, they
do not scale to large, realistic networks. In contrast, distributed
approaches scale well, but sacrifice solution quality due to their
limited scope of knowledge and coordination decisions.

To this end, we propose a hierarchical coordination approach
that combines the good solution quality of centralized approaches
with the scalability of distributed approaches. In doing so, we divide
the network into multiple hierarchical domains and optimize
coordination in a top-down manner. We compare our hierarchical
with a centralized approach in an extensive evaluation on a real-world
network topology. Our results indicate that hierarchical
coordination can find close-to-optimal solutions in a fraction of
the runtime of centralized approaches.}},
  author       = {{Schneider, Stefan Balthasar and Jürgens, Mirko and Karl, Holger}},
  booktitle    = {{IFIP/IEEE International Symposium on Integrated Network Management (IM)}},
  keywords     = {{network management, service management, coordination, hierarchical, scalability, nfv}},
  location     = {{Bordeaux, France}},
  publisher    = {{IFIP/IEEE}},
  title        = {{{Divide and Conquer: Hierarchical Network and Service Coordination}}},
  year         = {{2021}},
}

@article{21808,
  abstract     = {{Modern services consist of interconnected components,e.g., microservices in a service mesh or machine learning functions in a pipeline. These services can scale and run across multiple network nodes on demand. To process incoming traffic, service components have to be instantiated and traffic assigned to these instances, taking capacities, changing demands, and Quality of Service (QoS) requirements into account. This challenge is usually solved with custom approaches designed by experts. While this typically works well for the considered scenario, the models often rely on unrealistic assumptions or on knowledge that is not available in practice (e.g., a priori knowledge).

We propose DeepCoord, a novel deep reinforcement learning approach that learns how to best coordinate services and is geared towards realistic assumptions. It interacts with the network and relies on available, possibly delayed monitoring information. Rather than defining a complex model or an algorithm on how to achieve an objective, our model-free approach adapts to various objectives and traffic patterns. An agent is trained offline without expert knowledge and then applied online with minimal overhead. Compared to a state-of-the-art heuristic, DeepCoord significantly improves flow throughput (up to 76%) and overall network utility (more than 2x) on realworld network topologies and traffic traces. It also supports optimizing multiple, possibly competing objectives, learns to respect QoS requirements, generalizes to scenarios with unseen, stochastic traffic, and scales to large real-world networks. For reproducibility and reuse, our code is publicly available.}},
  author       = {{Schneider, Stefan Balthasar and Khalili, Ramin and Manzoor, Adnan and Qarawlus, Haydar and Schellenberg, Rafael and Karl, Holger and Hecker, Artur}},
  journal      = {{IEEE Transactions on Network and Service Management}},
  keywords     = {{network management, service management, coordination, reinforcement learning, self-learning, self-adaptation, multi-objective}},
  publisher    = {{IEEE}},
  title        = {{{Self-Learning Multi-Objective Service Coordination Using Deep Reinforcement Learning}}},
  doi          = {{10.1109/TNSM.2021.3076503}},
  year         = {{2021}},
}

@techreport{33854,
  abstract     = {{Macrodiversity is a key technique to increase the capacity of mobile networks. It can be realized using coordinated multipoint (CoMP), simultaneously connecting users to multiple overlapping cells. Selecting which users to serve by how many and which cells is NP-hard but needs to happen continuously in real time as users move and channel state changes. Existing approaches often require strict assumptions about or perfect knowledge of the underlying radio system, its resource allocation scheme, or user movements, none of which is readily available in practice.

Instead, we propose three novel self-learning and self-adapting approaches using model-free deep reinforcement learning (DRL): DeepCoMP, DD-CoMP, and D3-CoMP. DeepCoMP leverages central observations and control of all users to select cells almost optimally. DD-CoMP and D3-CoMP use multi-agent DRL, which allows distributed, robust, and highly scalable coordination. All three approaches learn from experience and self-adapt to varying scenarios, reaching 2x higher Quality of Experience than other approaches. They have very few built-in assumptions and do not need prior system knowledge, making them more robust to change and better applicable in practice than existing approaches.}},
  author       = {{Schneider, Stefan Balthasar and Karl, Holger and Khalili, Ramin and Hecker, Artur}},
  keywords     = {{mobility management, coordinated multipoint, CoMP, cell selection, resource management, reinforcement learning, multi agent, MARL, self-learning, self-adaptation, QoE}},
  title        = {{{DeepCoMP: Coordinated Multipoint Using Multi-Agent Deep Reinforcement Learning}}},
  year         = {{2021}},
}

@techreport{35889,
  abstract     = {{Network and service coordination is important to provide modern services consisting of multiple interconnected components, e.g., in 5G, network function virtualization (NFV), or cloud and edge computing. In this paper, I outline my dissertation research, which proposes six approaches to automate such network and service coordination. All approaches dynamically react to the current demand and optimize coordination for high service quality and low costs. The approaches range from centralized to distributed methods and from conventional heuristic algorithms and mixed-integer linear programs to machine learning approaches using supervised and reinforcement learning. I briefly discuss their main ideas and advantages over other state-of-the-art approaches and compare strengths and weaknesses.}},
  author       = {{Schneider, Stefan Balthasar}},
  keywords     = {{nfv, coordination, machine learning, reinforcement learning, phd, digest}},
  title        = {{{Conventional and Machine Learning Approaches for Network and Service Coordination}}},
  year         = {{2021}},
}

@inproceedings{19607,
  abstract     = {{Modern services consist of modular, interconnected
components, e.g., microservices forming a service mesh. To
dynamically adjust to ever-changing service demands, service
components have to be instantiated on nodes across the network.
Incoming flows requesting a service then need to be routed
through the deployed instances while considering node and link
capacities. Ultimately, the goal is to maximize the successfully
served flows and Quality of Service (QoS) through online service
coordination. Current approaches for service coordination are
usually centralized, assuming up-to-date global knowledge and
making global decisions for all nodes in the network. Such global
knowledge and centralized decisions are not realistic in practical
large-scale networks.

To solve this problem, we propose two algorithms for fully
distributed service coordination. The proposed algorithms can be
executed individually at each node in parallel and require only
very limited global knowledge. We compare and evaluate both
algorithms with a state-of-the-art centralized approach in extensive
simulations on a large-scale, real-world network topology.
Our results indicate that the two algorithms can compete with
centralized approaches in terms of solution quality but require
less global knowledge and are magnitudes faster (more than
100x).}},
  author       = {{Schneider, Stefan Balthasar and Klenner, Lars Dietrich and Karl, Holger}},
  booktitle    = {{IEEE International Conference on Network and Service Management (CNSM)}},
  keywords     = {{distributed management, service coordination, network coordination, nfv, softwarization, orchestration}},
  publisher    = {{IEEE}},
  title        = {{{Every Node for Itself: Fully Distributed Service Coordination}}},
  year         = {{2020}},
}

