<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMC</journal-id>
<journal-id journal-id-type="nlm-ta">CMC</journal-id>
<journal-id journal-id-type="publisher-id">CMC</journal-id>
<journal-title-group>
<journal-title>Computers, Materials &#x0026; Continua</journal-title>
</journal-title-group>
<issn pub-type="epub">1546-2226</issn>
<issn pub-type="ppub">1546-2218</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">73442</article-id>
<article-id pub-id-type="doi">10.32604/cmc.2025.073442</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>A Robot Grasp Detection Method Based on Neural Architecture Search and Its Interpretability Analysis</article-title>
<alt-title alt-title-type="left-running-head">A Robot Grasp Detection Method Based on Neural Architecture Search and Its Interpretability Analysis</alt-title>
<alt-title alt-title-type="right-running-head">A Robot Grasp Detection Method Based on Neural Architecture Search and Its Interpretability Analysis</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<name name-style="western"><surname>Rong</surname><given-names>Lu</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="author-notes" rid="afn1">#</xref></contrib>
<contrib id="author-2" contrib-type="author">
<name name-style="western"><surname>Xu</surname><given-names>Manyu</given-names></name><xref ref-type="aff" rid="aff-2">2</xref><xref ref-type="aff" rid="aff-3">3</xref><xref ref-type="author-notes" rid="afn1">#</xref></contrib>
<contrib id="author-3" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Zhu</surname><given-names>Wenbo</given-names></name><xref ref-type="aff" rid="aff-2">2</xref><xref rid="cor1" ref-type="corresp">&#x002A;</xref><email>zhuwenbo@fosu.edu.cn</email></contrib>
<contrib id="author-4" contrib-type="author">
<name name-style="western"><surname>Yang</surname><given-names>Zhihao</given-names></name><xref ref-type="aff" rid="aff-2">2</xref><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Dong</surname><given-names>Chao</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-4">4</xref><xref ref-type="aff" rid="aff-5">5</xref></contrib>
<contrib id="author-6" contrib-type="author">
<name name-style="western"><surname>Zhang</surname><given-names>Yunzhi</given-names></name><xref ref-type="aff" rid="aff-2">2</xref><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-7" contrib-type="author">
<name name-style="western"><surname>Wang</surname><given-names>Kai</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-8" contrib-type="author">
<name name-style="western"><surname>Zheng</surname><given-names>Bing</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-4">4</xref><xref ref-type="aff" rid="aff-5">5</xref></contrib>
<aff id="aff-1"><label>1</label><institution>South China Sea Marine Survey Center, Ministry of Natural Resources of the People&#x2019;s Republic of China</institution>, <addr-line>Guangzhou, 510300</addr-line>, <country>China</country></aff>
<aff id="aff-2"><label>2</label><institution>School of Mechanical Engineering and Automation, Foshan University</institution>, <addr-line>Foshan, 528200</addr-line>, <country>China</country></aff>
<aff id="aff-3"><label>3</label><institution>Guangdong Provincial Key Laboratory of Industrial Intelligent Inspection Technology, Foshan University</institution>, <addr-line>Foshan, 528000</addr-line>, <country>China</country></aff>
<aff id="aff-4"><label>4</label><institution>Key Laboratory of Marine Environmental Survey Technology and Application, Ministry of Natural Resources of the People&#x2019;s Republic of China</institution>, <addr-line>Guangzhou, 510300</addr-line>, <country>China</country></aff>
<aff id="aff-5"><label>5</label><institution>Southern Marine Science and Engineering Guangdong Laboratory (Zhuhai)</institution>, <addr-line>Zhuhai, 519000</addr-line>, <country>China</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Author: Wenbo Zhu. Email: <email>zhuwenbo@fosu.edu.cn</email></corresp>
<fn id="afn1">
<p><sup>#</sup>These authors contributed equally to this work</p>
</fn>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2026</year>
</pub-date>
<pub-date date-type="pub" publication-format="electronic">
<day>10</day><month>2</month><year>2026</year>
</pub-date>
<volume>87</volume>
<issue>1</issue>
<elocation-id>52</elocation-id>
<history>
<date date-type="received">
<day>18</day>
<month>09</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>25</day>
<month>11</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2026 The Authors.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Published by Tech Science Press.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMC_73442.pdf"></self-uri>
<abstract>
<p>Deep learning has become integral to robotics, particularly in tasks such as robotic grasping, where objects often exhibit diverse shapes, textures, and physical properties. In robotic grasping tasks, due to the diverse characteristics of the targets, frequent adjustments to the network architecture and parameters are required to avoid a decrease in model accuracy, which presents a significant challenge for non-experts. Neural Architecture Search (NAS) provides a compelling method through the automated generation of network architectures, enabling the discovery of models that achieve high accuracy through efficient search algorithms. Compared to manually designed networks, NAS methods can significantly reduce design costs, time expenditure, and improve model performance. However, such methods often involve complex topological connections, and these redundant structures can severely reduce computational efficiency. To overcome this challenge, this work puts forward a robotic grasp detection framework founded on NAS. The method automatically designs a lightweight network with high accuracy and low topological complexity, effectively adapting to the target object to generate the optimal grasp pose, thereby significantly improving the success rate of robotic grasping. Additionally, we use Class Activation Mapping (CAM) as an interpretability tool, which captures sensitive information during the perception process through visualized results. The searched model achieved competitive, and in some cases superior, performance on the Cornell and Jacquard public datasets, achieving accuracies of 98.3% and 96.8%, respectively, while sustaining a detection speed of 89 frames per second with only 0.41 million parameters. 
To further validate its effectiveness beyond benchmark evaluations, we conducted real-world grasping experiments on a UR5 robotic arm, where the model demonstrated reliable performance across diverse objects and high grasp success rates, thereby confirming its practical applicability in robotic manipulation tasks.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Robotics</kwd>
<kwd>grasping detection</kwd>
<kwd>neural architecture search</kwd>
<kwd>neural network interpretability</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>Guangdong Basic and Applied Basic Research Foundation</funding-source>
<award-id>2023B1515120064</award-id>
</award-group>
<award-group id="awg2">
<funding-source>National Natural Science Foundation of China</funding-source>
<award-id>62273097</award-id>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>In robotic grasping scenarios, manually designed object grasping detection networks are typically tailored for a specific set or category of objects. When faced with objects rapidly passing through a conveyor belt, these objects vary in contours, textures, glossiness, and other physical characteristics. Variations in external object features introduce discrepancies between the data distribution used during training and that encountered in real-world scenarios, a phenomenon commonly termed concept drift. Consequently, the decline in robustness of the model occurs as it encounters difficulties in generalizing and adapting to the external features of unseen objects. When the external features of the objects deviate beyond the scope of the training data of the model, it significantly impacts the grasping success rate and challenges the accuracy and stability of the recognition capabilities of the system. While techniques for noise reduction, including Laplacian operators, wavelet thresholding, and Gaussian filtering, can reduce the loss of accuracy, they are often insufficient to ensure system performance when concept drift exceeds the training range of the model. Concept drift in robotic grasping&#x2014;stemming from changes in object appearance, lighting, or wear in the gripper&#x2014;can degrade grasping performance. While deep-learning models can be retrained or fine-tuned to recover accuracy, frequent manual intervention is impractical on the shop floor. Consequently, developing lightweight, autonomous adaptation mechanisms that can detect and mitigate drift without requiring expert intervention or costly retraining has become a critical challenge for reliable long-term deployment.</p>
<p>Neural Architecture Search (NAS) represents a core component of Automated Machine Learning (AutoML) [<xref ref-type="bibr" rid="ref-1">1</xref>] focused on automatically designing deep neural network models with outstanding performance, achieving optimal results for target tasks. NAS has grown rapidly in recent years and is now used in a variety of domains such as computer vision [<xref ref-type="bibr" rid="ref-1">1</xref>&#x2013;<xref ref-type="bibr" rid="ref-4">4</xref>], speech processing [<xref ref-type="bibr" rid="ref-5">5</xref>&#x2013;<xref ref-type="bibr" rid="ref-7">7</xref>], and natural language processing (NLP) [<xref ref-type="bibr" rid="ref-8">8</xref>,<xref ref-type="bibr" rid="ref-9">9</xref>]. The research on NAS holds significant theoretical and practical value. First, NAS can rapidly locate the best network architecture through automatic search, enhancing model performance while saving substantial time spent on manual architecture and parameter adjustments. Second, NAS technology does not require specialized knowledge in relevant fields, making it widely accessible to non-experts. However, Neural Architecture Search (NAS) frequently yields networks with numerous parameters and redundant structures, leading to high memory consumption and decreased operational efficiency, particularly in resource-constrained computing environments. In response, researchers have proposed various methods to improve search efficiency and model performance. Hardware-aware NAS methods effectively balance accuracy with hardware constraints. The study in [<xref ref-type="bibr" rid="ref-10">10</xref>] introduces a new hardware-aware NAS framework that considers the suitability of network layer features for hardware mapping, enhancing deployment efficiency. 
The study in [<xref ref-type="bibr" rid="ref-11">11</xref>] presents a fast hardware-aware NAS method&#x2014;S<sup>3</sup>NAS, that can find networks with better delay-accuracy trade-offs than state-of-the-art (SOTA) networks through three steps. The study in [<xref ref-type="bibr" rid="ref-12">12</xref>] introduces a streamlined and scalable hardware-aware NAS framework designed to enhance the precision of optical networks while maintaining the operational efficiency of the target hardware within a two-stage process. Furthermore, NAS methods based on evolutionary algorithms, reinforcement learning, Bayesian optimization, and gradient-based updates likewise confer comparable advantages.</p>
<p>Complex network architectures, characterized by numerous interconnected nodes and edges, not only complicate training by increasing the burden of parameter updates and computations but also demand substantially greater computational resources during deployment. The study in [<xref ref-type="bibr" rid="ref-13">13</xref>] introduced a differentiable neural architecture search (NAS) approach that incorporates computational complexity constraints, resulting in a sparse topology for the discovered architecture. At the same time, the study in [<xref ref-type="bibr" rid="ref-14">14</xref>] employs a binary gating strategy to enable partial channel connections, thereby establishing a sparse connection mode to optimize the efficiency of the search procedure. This method, proposed in reference [<xref ref-type="bibr" rid="ref-15">15</xref>] introduces a technique for directing differentiable channel pruning using an attention mechanism, facilitating the regulation of network depth and width. Therefore, it is clear that when searching for and generating compact networks, network complexity must often be considered.</p>
<p>Deep learning has significantly improved performance across diverse tasks, yielding transformative results across a wide range of domains, including image recognition, natural language processing, and decision-making systems. However, these models typically contain millions of parameters and complex nonlinear structures, making their decision-making process a black box that is difficult for humans to understand and interpret. This opacity not only undermines trust in the models but also raises concerns about the rationale and fairness of their decisions. In recent years, the interpretability of models has been a focal point of attention. The objective of neural network interpretability is to uncover the internal mechanisms of the model and offer an intuitive understanding of its decision-making process. Such research offers insights into how the model extracts features from input data and makes decisions, thereby helping to alleviate the black-box problem. A clear insight into deep learning models plays a key role in optimizing network architectures and enhancing their robustness in real-world applications. To address this need, we investigate the internal mechanisms of visual models using the Class Activation Mapping (CAM) framework [<xref ref-type="bibr" rid="ref-16">16</xref>].</p>
<p>The main contributions of this study are summarized as follows:
<list list-type="simple">
<list-item><label>(1)</label><p>The automated design of robot grasp detection networks is achieved by integrating neural architecture search (NAS) with robotics technology, which greatly improves the efficiency of network design.</p></list-item>
<list-item><label>(2)</label><p>An improved NAS approach is proposed by introducing an additional attention-based search space optimization method, addressing the issues of complex feature extraction and redundant structures caused by stacking multiple normal cells, thus effectively enhancing the network search efficiency.</p></list-item>
<list-item><label>(3)</label><p>We evaluate the comprehensive performance of the searched models by deploying a concise and efficient network on a UR5 robotic arm for physical grasping experiments. Experimental results verify the validity and robustness of our model. Moreover, Class Activation Maps (CAM) are employed to enhance the interpretability and credibility in the outputs of the model.</p></list-item>
</list></p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related Work</title>
<sec id="s2_1">
<label>2.1</label>
<title>Robotic Grasp Detection Technology</title>
<sec id="s2_1_1">
<label>2.1.1</label>
<title>Traditional Robot Grasp Detection Methods</title>
<p>Early research on robot grasp detection predominantly utilized analytical and heuristic approaches. Analytical methods rigorously model the physical interaction between the gripper and the object. As reviewed in [<xref ref-type="bibr" rid="ref-17">17</xref>], these methods involve the selection of finger positions and hand configurations guided by kinematic and dynamic equations pertaining to grasp stability and task specifications. Seminal work in this area, such as that of Ferrari and Canny [<xref ref-type="bibr" rid="ref-18">18</xref>], focused on planning optimal grasps based on force closure criteria. However, analytical approaches typically require comprehensive and precise knowledge of the physical characteristics of the object (e.g., geometry, mass, friction), which is often unavailable in real-world scenarios where objects are novel, partially occluded, or have unknown properties.</p>
<p>Heuristic methods, in contrast, often rely on empirical rules and simplified models. As surveyed in [<xref ref-type="bibr" rid="ref-19">19</xref>,<xref ref-type="bibr" rid="ref-20">20</xref>], these approaches are frequently based on the geometric structure of the target object and utilize pre-defined grasping strategies or templates, such as aligning a parallel-jaw gripper with antipodal points on the contour of an object. Yet, heuristic methods, while computationally efficient for known objects, are inherently limited by their pre-defined rules and templates. They lack the adaptability to generalize across the vast diversity of object shapes and configurations encountered outside of controlled settings, making them brittle in the face of novelty.</p>
<p>The limitations of these model-based and rule-driven approaches&#x2014;specifically, their inability to handle perceptual uncertainty and generalize to novel objects&#x2014;motivated the paradigm shift towards data-driven solutions, particularly those leveraging deep learning, which can learn robust grasping strategies directly from sensory data.</p>
</sec>
<sec id="s2_1_2">
<label>2.1.2</label>
<title>Deep Learning-Based Method for Robot Grasp Detection</title>
<p>With the rapid development of deep learning, more and more people have applied it to robot grasping and detection, making it a prominent research focus [<xref ref-type="bibr" rid="ref-21">21</xref>]. Most deep learning methods learn object feature representations and grasping strategies from a large amount of training data, thereby being able to automatically grasp different types of objects. Early deep learning approaches, such as those presented in [<xref ref-type="bibr" rid="ref-22">22</xref>], demonstrated the feasibility of using Convolutional Neural Networks (CNNs) to directly predict grasp configurations from RGB or RGB-D images, achieving significant improvements over traditional methods. However, these models were limited in their ability to capture complex features, and their performance was constrained by the scale and diversity of the training datasets available at the time.</p>
<p>Subsequent research has explored various architectures and input modalities to enhance performance. For instance, methods based on 3D point clouds [<xref ref-type="bibr" rid="ref-23">23</xref>] aim to leverage richer geometric information, potentially improving grasp accuracy in cluttered environments. Nevertheless, processing 3D point clouds is computationally expensive and often requires non-trivial data pre-processing, posing challenges for real-time deployment on robotic systems with limited computational resources. Similarly, approaches using rendered RGB-D images for pose regression [<xref ref-type="bibr" rid="ref-24">24</xref>] sought to fuse color and depth information. Yet, their performance can be highly sensitive to the quality and calibration of the depth sensors, and the fusion strategy may not fully exploit the complementary nature of the two modalities.</p>
<p>Furthermore, architectural innovations such as the incorporation of multi-scale spatial pyramid modules (MSSPM) [<xref ref-type="bibr" rid="ref-25">25</xref>] and hierarchical feature fusion methods [<xref ref-type="bibr" rid="ref-26">26</xref>] have been employed to enhance the precision and reliability of robotic grasping by capturing features at different scales and resolutions. While effective, these manually designed architectural enhancements often increase model complexity and parameter count. This introduces a significant engineering burden, as designing optimal structures requires extensive domain expertise and iterative trial-and-error, and the resulting models may still suffer from information redundancy or inefficient feature propagation.</p>
<p>Recent studies such as GoalGrasp [<xref ref-type="bibr" rid="ref-27">27</xref>] have proposed that grasping and positioning in occlusion scenarios can be achieved through target semantic reasoning without grasping training, significantly enhancing the zero-shot generalization ability for new objects. However, its control over grasping accuracy and stability is weaker than that of the method proposed in this paper.</p>
<p>The challenges addressed in robot grasping and detection, such as the robustness requirements for object diversity and real-time performance, are in line with a wider range of computer vision applications. For instance, in autonomous systems, Emin G&#x00FC;ney et al. [<xref ref-type="bibr" rid="ref-28">28</xref>] developed a computer vision-based charging controller for shore-based robots, which shares the same precise visual servo and real-time decision-making requirements as robot grasping. Similarly, the advancements in general object detection directly provide guidance for grasping detection methods. The research of reference [<xref ref-type="bibr" rid="ref-29">29</xref>] on using Faster R-CNN to detect traffic signs demonstrated the powerful ability of regional proposal networks in accurately locating objects of interest in the scene. Although our task is to predict the directional grasping of rectangles rather than horizontal bounding boxes, the fundamental principle of using deep neural networks for precise spatial positioning is a common thread. Our approach based on network architecture can be regarded as a development of these concepts. In this approach, the network architecture itself is end-to-end optimized for the specific task of grasp rectangle detection, which may be superior in terms of efficiency and accuracy to those artificially designed backbone networks used in Faster R-CNN.</p>
<p>The framework proposed in this paper also falls within the category of deep learning-based grasp detection methods, however, its uniqueness lies in the adoption of Neural Architecture Search (NAS) technology to achieve an automated design process. Unlike manually designed architectures such as ResNet-based networks [<xref ref-type="bibr" rid="ref-30">30</xref>], or specially designed architectures like SE-ResUNet [<xref ref-type="bibr" rid="ref-31">31</xref>], our method does not presuppose a fixed backbone network or connection pattern. On the contrary, it will automatically discover the optimal network topology from a predefined search space, which includes convolution operations, attention mechanisms, and hybrid CNN-Transformer modules. This NAS-driven approach reduces human bias and engineering workload, while explicitly optimizing for accuracy and efficiency&#x2014;a key consideration for real-time robotics applications.</p>
</sec>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Attention Mechanism</title>
<p>Neural network attention mechanisms are motivated by human cognitive processes, enhancing task performance by selectively emphasizing the most relevant input features. These mechanisms improve information processing in terms of flexibility and accuracy through the assignment of varying weights to input elements. In image classification, as network models have expanded and deepened, the expressive power of Convolutional Neural Networks (CNNs) has significantly improved, and numerous studies have focused on exploring deeper and broader trainable structures [<xref ref-type="bibr" rid="ref-32">32</xref>].</p>
<p>The core operation of CNNs is convolution, through which the network can capture rich spatial and channel information. However, some of this information may be irrelevant or interfere with key features, which can negatively impact network performance, increase computational load, and reduce detection accuracy. Therefore, incorporating attention mechanisms to filter redundant information and selectively emphasize critical features can enhance the performance of neural network models.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Neural Network Architecture Search</title>
<p>Recent developments in deep learning have highlighted Neural Architecture Search (NAS) as an important research direction. It aims to automatically generate high-performance neural networks using limited computational resources, without the need for manual intervention throughout the search process.</p>
<p>As shown in <xref ref-type="fig" rid="fig-1">Fig. 1</xref>, Neural Architecture Search (NAS) comprises three primary components: the search space, search strategy, and performance evaluation. NAS employs a search strategy to explore network architectures within a predefined search space and identifies the optimal one based on performance evaluation. The search space characterizes the entirety of feasible network architectures, represented by diverse arrangements of stacked neural units. It encompasses all potential layers, connections, and operations&#x2014;such as convolution, pooling, and fully connected layers&#x2014;and their combinations. The design of the search space plays a pivotal role in determining the performance of a NAS algorithm, as it establishes the algorithm&#x2019;s degrees of freedom and, to some extent, constrains its achievable performance ceiling. Search strategies determine the algorithms employed to efficiently identify optimal network architectures and parameter configurations. The principal approaches currently include Evolutionary Algorithms (EA), Reinforcement Learning (RL), and Gradient-based Update (GU) methods. EA-based neural architecture search [<xref ref-type="bibr" rid="ref-33">33</xref>] simulates the biological evolutionary mechanism and does not rely on gradient information, which allows for faster convergence in the early stages of the search, but requires significant computational resources. RL-based algorithms [<xref ref-type="bibr" rid="ref-3">3</xref>,<xref ref-type="bibr" rid="ref-34">34</xref>] use recurrent networks to construct a model representation of the neural network. These algorithms employ reinforcement learning to optimize the Recurrent Neural Network (RNN) towards enhancing the anticipated accuracy of the resultant architecture on the validation dataset. Nevertheless, RL techniques are associated with substantial computational expenses. 
Conversely, GU-based algorithms offer a more computationally economical alternative. Differentiable Architecture Search (DARTS) [<xref ref-type="bibr" rid="ref-35">35</xref>], is a gradient-update-based NAS algorithm that utilizes gradient descent based on architecture continuous relaxation to perform efficient architecture search. Although GU-based algorithms are computationally efficient, gradient-based optimization methods tend to suffer from high memory usage and inappropriate relaxation when applied. Furthermore, since most existing gradient-based algorithms rely on generational super networks, these algorithms require substantial domain-specific knowledge for effective super network construction. The final component of NAS is performance evaluation, which defines how to assess the performance of candidate architectures to more efficiently identify the best network architecture, such as through network mapping [<xref ref-type="bibr" rid="ref-36">36</xref>].</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>NAS Search Flowchart</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73442-fig-1.tif"/>
</fig>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Class Activation Map</title>
<p>Since it was introduced in [<xref ref-type="bibr" rid="ref-37">37</xref>], Class Activation Mapping (CAM) has attracted widespread attention in computer vision due to its simplicity and insightful nature. CAM is essentially a heatmap that highlights regions in an image that are critical for predicting a specific class, offering a clear visualization of the decision-making process of the neural network. This visualization technique effectively reveals the key areas that the model focuses on, thus offering interpretability of the decision-making process of the neural network. Initially, CAM was generated through a linear combination of feature maps. This approach later led to the development of several related methods, including Grad-CAM, Score-CAM, and Layer-CAM.</p>
<p>For deep convolutional neural networks, after multiple convolution and pooling operations, the final convolutional layer retains both rich spatial features and high-level semantic information. However, the subsequent fully connected and SoftMax layers produce highly abstract features, making them difficult to visualize directly. Therefore, to effectively interpret CNN classification decisions, it is crucial to extract and utilize interpretable features embedded within the last convolutional layer. Inspired by the approach presented in [<xref ref-type="bibr" rid="ref-38">38</xref>], Class Activation Maps (CAM) replaces conventional fully connected layers with Global Average Pooling (GAP). GAP computes the mean of each feature map in the final convolutional layer and generates the network output through a weighted summation. By effectively leveraging spatial information and eliminating the numerous parameters associated with fully connected layers, GAP reduces the risk of overfitting and enhances model robustness.</p>
<p>Class Activation Mapping (CAM) provides a simple and effective method to visualize the regions in the input image that have the greatest impact on the grasp prediction of the model. By replacing the fully connected layer with a global average pooling layer, CAM generates an intuitive heat map highlighting significant areas, thereby enhancing the transparency of the grasp detection process. This is particularly useful for identifying potential failure modes. For instance, when the model wrongly focuses on irrelevant background areas rather than the object itself. This visual feedback provides valuable guidance for debugging and improving the network architecture.</p>
<p>However, CAM also has its inherent limitations. Firstly, CAM usually relies on the final convolutional layer. Due to the previous downsampling operation, this layer has a reduced spatial resolution. As a result, the generated heatmaps are often relatively rough and may lack precise pixel-level positioning, especially for small or slender objects. Secondly, CAM was originally designed for classification tasks and requires careful adjustment to be used for dense prediction tasks like object detection. Although extending the class activation map (CAM) for visualization helps to visualize the regions involved in the object grasping decision, it cannot directly explain regression parameters such as grasp angle or width. Thirdly, CAM mainly answers the question &#x201C;where is the model looking&#x201D;, but cannot answer &#x201C;why&#x201D; these areas are considered important. It does not provide causal or semantic insights into the decision-making process of the model. To address these shortcomings, future work will explore more advanced interpretability techniques, providing finer spatial resolution and deeper interpretability capabilities for the classification and regression components of the model.</p>
</sec>
<sec id="s2_5">
<label>2.5</label>
<title>Grasping Pose Representation</title>
<p>To generate precise grasping configurations, it is essential to explicitly represent grasping poses. A simple and clear five-dimensional representation method for describing 2D grasps of parallel grippers has been proposed [<xref ref-type="bibr" rid="ref-22">22</xref>], which constructs a gripper rectangle through five parameters. See <xref ref-type="disp-formula" rid="eqn-1">Eq. (1)</xref> for an example:
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:mi>G</mml:mi><mml:mo>=</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x03B8;</mml:mi><mml:mo>,</mml:mo><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></disp-formula>here, <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> denotes the centroid of the gripper, <inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:mi>&#x03B8;</mml:mi></mml:math></inline-formula> represents its angular orientation relative to the horizontal axis, and <italic>h</italic> and <italic>w</italic> correspond to the gripper&#x2019;s height and width, respectively. Morrison [<xref ref-type="bibr" rid="ref-30">30</xref>] optimized this grasping definition by disregarding the height parameter and introduced a grasping quality metric to denote the probability of successful grasping, as <xref ref-type="disp-formula" rid="eqn-2">Eq. (2)</xref> below:
<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:mrow><mml:mover><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mo>=</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></disp-formula></p>
<p>The variable <inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is utilized as a score to assess the quality of the predicted grasp, with a range of [0, 1]. On the other hand, <inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:mi>&#x03B8;</mml:mi></mml:math></inline-formula> represents the rotation angle within the camera reference system and takes values within the range [&#x2212;90, 90] degrees. <inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> denotes the grasp width in image coordinates, with values ranging from [0, <italic>W</italic><sub>max</sub>]. The coordinates <inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> indicate the centroid of the grasp configuration. To project the grasp representation from the 2D image plane to the coordinate frame of the robot, the following transformations are employed as <xref ref-type="disp-formula" rid="eqn-3">Eq. (3)</xref>.
<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>R</mml:mi><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover><mml:mi>G</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p>The matrices <inline-formula id="ieqn-7"><mml:math id="mml-ieqn-7"><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>R</mml:mi><mml:mi>C</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-8"><mml:math id="mml-ieqn-8"><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represent the transformation mappings from the camera frame of reference to the robot frame of reference and from the 2D image frame to the 3D space, respectively. The ensemble of all potential grasps within the image space is designated by <xref ref-type="disp-formula" rid="eqn-4">Eq. (4)</xref>:
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mrow><mml:mover><mml:mi>Q</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mover><mml:mrow><mml:mi mathvariant="normal">&#x03A6;</mml:mi></mml:mrow><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mover><mml:mrow><mml:mtext>W</mml:mtext></mml:mrow><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mo fence="false" stretchy="false">}</mml:mo><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x211B;</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></disp-formula>where <inline-formula id="ieqn-9"><mml:math id="mml-ieqn-9"><mml:mrow><mml:mover><mml:mi>Q</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow></mml:math></inline-formula>, <inline-formula id="ieqn-10"><mml:math id="mml-ieqn-10"><mml:mrow><mml:mover><mml:mrow><mml:mi mathvariant="normal">&#x03A6;</mml:mi></mml:mrow><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow></mml:math></inline-formula> and <inline-formula id="ieqn-11"><mml:math id="mml-ieqn-11"><mml:mrow><mml:mover><mml:mrow><mml:mtext>W</mml:mtext></mml:mrow><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow></mml:math></inline-formula> are the results of three <inline-formula id="ieqn-12"><mml:math id="mml-ieqn-12"><mml:msup><mml:mrow><mml:mi>&#x211B;</mml:mi></mml:mrow><mml:mrow><mml:mi>H</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> pixel-level images representing the grasp quality, grasp angle, and grasp width, respectively.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Methods</title>
<sec id="s3_1">
<label>3.1</label>
<title>Res-Swin-CNN Block (RSC Block)</title>
<p>The integration of CNN and Swin Transformer represents a novel approach in deep learning research [<xref ref-type="bibr" rid="ref-37">37</xref>]. As shown in <xref ref-type="fig" rid="fig-2">Fig. 2</xref>, the module not only integrates the core components of CNN and Swin Transformer but also introduces attention modules and residual structures to enhance local feature extraction and establish long-range dependencies more effectively.</p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>RSC Module Architecture. Input tensor X is split into two branches: Conv-Res path (local detail) and Swin-Transformer path (global context); concatenated features are refined by CBAM and a 1 &#x00D7; 1 conv. Symbols: Conv &#x003D; convolution, BN &#x003D; batch normalization, ST &#x003D; Swin-Transformer, CBAM &#x003D; convolutional block attention module</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73442-fig-2.tif"/>
</fig>
<p>The input feature tensor <inline-formula id="ieqn-13"><mml:math id="mml-ieqn-13"><mml:mi>X</mml:mi></mml:math></inline-formula> is first processed by a 1 &#x00D7; 1 convolution, and then split uniformly into two feature maps <inline-formula id="ieqn-14"><mml:math id="mml-ieqn-14"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>. These two feature mappings are then processed respectively by the convolution block (ConvBlock) and the residual block (ResBlock), as well as the swin-transformer block (STBlock) channel. The ConvBlock consists of two sequential 3 &#x00D7; 3 convolutional layers, each followed by a Batch Normalization (BN) layer and a ReLU activation function. The ResBlock is similar to the structure mentioned in reference [<xref ref-type="bibr" rid="ref-39">39</xref>], with an expansion factor of 4. The STBlock is a standard Swin Transformer Block [<xref ref-type="bibr" rid="ref-37">37</xref>] with a window size of 7 and 4 attention heads. Subsequently, the outputs of the two channels are concatenated and then processed by a second 1 &#x00D7; 1 convolution. Finally, the result is processed through a spatio-temporal attention residual connection. This process can be summarized by the following as <xref ref-type="disp-formula" rid="eqn-5">Eq. (5)</xref>:
<disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:mtable columnalign="left" rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>p</mml:mi><mml:mi>l</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:mi>X</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>T</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mi>Z</mml:mi><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:m
o>(</mml:mo><mml:mrow><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>C</mml:mi><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>m</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>X</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula></p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Search Space Base Attention</title>
<p>Cell-based search methods typically comprise multiple normal cells stacked together, followed by a reduction cell. While this structure can enhance the feature representational capacity of the model, the accumulation of numerous redundant cells within the network results in feature redundancy, which affects the efficiency and performance of the model. To address this issue, we introduce an attention-based cell search space method.</p>
<p>As illustrated in <xref ref-type="fig" rid="fig-3">Fig. 3</xref>, to enhance the performance of the model, the paper constructs a search space containing six types of attention cells, specifically SE, CBAM, EMA, TMHSA, SC2A, and TKSA. Each module demonstrates different advantages in specific tasks, making them highly adaptable. SE [<xref ref-type="bibr" rid="ref-40">40</xref>] proposed a streamlined attention mechanism that targets channel-level operations, improving the capability of the network to identify and prioritize important channels. In contrast, CBAM [<xref ref-type="bibr" rid="ref-41">41</xref>] combines spatial and channel attention mechanisms to enhance individual channel representations while capturing critical features across diverse spatial locations. While SE and CBAM show significant performance improvements in channel attention, they lack the interrelationship between channels and cannot strengthen feature information representation. In recent years, the Transformer architecture has achieved significant breakthroughs in natural language processing (NLP) and has been progressively adopted for computer vision applications [<xref ref-type="bibr" rid="ref-42">42</xref>]. As the core of Transformer, the self-attention mechanism can establish global dependencies and expands the receptive field, enabling it to capture more contextual information. Reference [<xref ref-type="bibr" rid="ref-43">43</xref>] proposes an effective DeRaining network, which includes the sparse transformer (DRSformer) that solves the redundancy problem in self-attention within Transformer by using the similarity between all query-key pairs for feature aggregation. Specifically, this network designs a learnable top-k selection operator-based attention mechanism (TKSA) that adaptively retains the most important attention scores for each query, thereby achieving better feature aggregation. 
Although the Transformer excels at global modeling, it still neglects variations in spatial information across channels. To more effectively capture cross-channel dependencies, researchers have proposed several solutions. EMA [<xref ref-type="bibr" rid="ref-44">44</xref>] integrates output features from two parallel sub-networks by learning across space without reducing the channel dimensions, capturing pixel-level pairwise relationships and avoiding the computational overhead of traditional methods.</p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>Attention-based Cell Search Space Embedding Diagram. Six candidate attention cells (SE, CBAM, EMA, TMHSA, SC2A, TKSA) are randomly attached after normal cells during NAS</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73442-fig-3.tif"/>
</fig>
<p>To efficiently identify the most effective attention cell for our task, we employ a Uniform Random Sampling strategy. Specifically, during the search phase, for each position designated for an attention cell, one module is randomly and independently selected from the pool of six candidates with equal probability. This process is repeated over multiple search iterations to sufficiently explore the search space. The total number of search iterations was set to 500. To accelerate the performance estimation of each sampled architecture, we trained it on a 10% random subset of the training data for 10 epochs and used the resulting validation accuracy as a proxy. Random search is computationally efficient and avoids the overhead associated with more complex search strategies like Reinforcement Learning or Evolutionary Algorithms. At the same time, it provides an unbiased exploration, preventing the search from being prematurely trapped by local optima, which is a potential risk in gradient-based methods. Random search has been shown to be a strong baseline in hyperparameter and architecture optimization [<xref ref-type="bibr" rid="ref-45">45</xref>], often competing favorably with more sophisticated algorithms while being significantly simpler to implement.</p>
<p>We employ a random search method to select an attention cell from this search space and insert it after the stack of normal cells. This method operates by sampling each candidate cell with equal probability during each search iteration. It not only captures more key features but also effectively decreases the impact of redundant characteristics, optimizing the feature extraction process.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Definition of the Initial Search Space</title>
<p>To design an efficient and compact network structure, the paper draws on the design principles of several classic neural network architectures when defining the initial search space. These classic network architectures balance network expressiveness and computational cost through different techniques, providing valuable references. ResNet [<xref ref-type="bibr" rid="ref-39">39</xref>] mitigates the issue of vanishing gradients in deep networks through the incorporation of skip connections, facilitating the development of deeper network structures. Additionally, GoogLeNet [<xref ref-type="bibr" rid="ref-46">46</xref>] improves network performance and efficiency by parallelizing convolution operations of different sizes. The key innovation of this model is its Inception module, which enables the network to adaptively determine the most effective convolution or pooling operation at different scales. MobileNet [<xref ref-type="bibr" rid="ref-47">47</xref>] replaces traditional convolutions with depthwise separable convolution, leading to a significant decrease in both parameters and computational cost. AlexNet [<xref ref-type="bibr" rid="ref-48">48</xref>] not only uses conventional convolution and pooling operations but also introduces the ReLU activation function and Batch Normalization (BN). The ReLU helps prevent the vanishing gradients problem during training, while BN significantly improves the generalization capability of the model. Based on the concepts of these models, the paper designed the search space to include 1 &#x00D7; 1 convolution, 3 &#x00D7; 3 convolution, 5 &#x00D7; 5 convolution, depthwise 3 &#x00D7; 3 convolution, ReLU activation function, BN, and MaxPooling for building the basic cells. The 1 &#x00D7; 1 convolutions enable channel fusion, enhancing the non-linear representational capacity of the network. 
At the same time, this paper constructed an initial search space comprising eight normal cells, six attention cells, and eight reduction cells, providing the search algorithm with sufficient flexibility and diversity to efficiently explore different network topologies. Normal cells maintain the spatial dimensions of feature maps, attention cells highlight salient information, and reduction cells reduce the feature map size by half. The entire network consists of 4 normal cells, 2 attention cells, and 2 reduction cells. The initial channel number for the network is set to 32. Each Reduction Cell doubles the channel count while halving the spatial dimensions of the feature maps. This configuration balances the network&#x2019;s depth and width, ensures sufficient representational power, and reduces computational cost. Furthermore, this categorization aids the model in capturing features across various scales, thereby improving its ability to adjust to changes in scale. By establishing a well-balanced initial search space, as depicted in <xref ref-type="fig" rid="fig-4">Fig. 4</xref>, the paper lay the foundation for network searches aimed at identifying accurate and computationally efficient network structures in subsequent explorations.</p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>Cell structure diagram of the defined initial search space. A-H are Normal cells, and I-P are Reduced cells</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73442-fig-4a.tif"/>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73442-fig-4b.tif"/>
</fig>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>NAS-based Robot Grasp Detection Network Architecture</title>
<p>As illustrated in <xref ref-type="fig" rid="fig-5">Fig. 5</xref>, we employ NAS to automatically design the encoder and decoder components of the robotic grasp detection network. Specifically, the proposed network takes the original image as input, extracts image features through the automatically designed network architecture and three RSC modules, and finally outputs three images representing grasp quality, grasp width, and grasp angle. The upsampling operation is implemented using bilinear interpolation. The three output heads are all composed of a single 1 &#x00D7; 1 convolution layer. Specifically, the quality head uses a sigmoid activation function to output values in [0, 1], while the angle and width heads use linear activations. This feature extraction strategy aims to combine the advantages of CNN in capturing detailed features and the Swin Transformer in focusing on global image features. As evidenced by the experimental results, this strategy demonstrates effective performance.</p>
<fig id="fig-5">
<label>Figure 5</label>
<caption>
<title>NAS-based Robot Grasp Detection Network Architecture Diagram. Encoder: searched normal/attention/reduction cells &#x002B; three RSC blocks; decoder: three 1 &#x00D7; 1 heads output Q (quality), &#x0398; (angle), W (width). Up-sampling uses bilinear interpolation. Input: 224 &#x00D7; 224 RGB-D; output: 3 &#x00D7; 224 &#x00D7; 224 maps</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73442-fig-5.tif"/>
</fig>
<p>Our lightweight network adapts to the target object through two main mechanisms. Firstly, attention-based units (such as SE, CBAM, TKSA) dynamically recalibrate the feature response based on the input, enabling the model to emphasize the channels and spatial regions relevant to the task. For instance, when grasping the deformable sponge, channel attention might amplify features related to texture and flexibility, while for rigid plastic boxes, spatial attention might focus on the edges and corners. Secondly, the multi-scale feature extraction achieved through the combination of Normal, Reduction, and RSC modules enables the network to capture local geometric details and global context information. This is particularly important for dealing with objects of different sizes and shapes.</p>
<p>On this basis, we further emphasize the three adaptability aspects introduced by the NAS framework:
<list list-type="order">
<list-item>
<p>Structural adaptability: The NAS algorithm automatically explores the combination of CNN, Swin Transformer, and attention mechanisms, enabling the network to adjust its topological structure based on object features. This enables the model to prioritize local details (such as edges, textures) or global structures (such as shapes, postures) based on the input.</p></list-item>
<list-item>
<p>Feature adaptability: The RSC module integrates spatial and channel attention mechanisms, enabling dynamic re-weighting of the feature map based on the cues of specific objects. This ensures that the model focuses on the most informative areas (such as the graspable parts), while suppressing irrelevant backgrounds or redundant features.</p></list-item>
<list-item>
<p>Input modal flexibility: This network supports RGB and RGB-D inputs and can adaptively fuse color and depth information based on object attributes. For instance, for low-texture or reflective objects, depth data might be given more emphasis, while for objects with rich textures or colors, RGB data might be more reliable.</p></list-item>
</list></p>
<p>In summary, through the synergistic effect of attention-based dynamic calibration, multi-scale feature extraction, and NAS-driven structure search, the proposed network achieves strong adaptability to various objects and can perform robust and accurate grasping and detection under different shapes, materials, and appearances.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments</title>
<sec id="s4_1">
<label>4.1</label>
<title>Experimental Setup</title>
<p>The experiments of the paper are conducted on the Windows 10 operating system, which utilized the PyTorch 1.12 deep learning framework with CUDA 11.6. All experiments were performed on a system equipped with an RTX 4090 GPU and a 13th Gen Intel (R) Core (TM) i9-13900K processor. We trained on the public Cornell and Jacquard datasets, using 90% of the samples for training and 10% for testing. To thoroughly assess model performance, we adopted two data splitting strategies: image-wise (IW) and object-wise (OW) splits. IW strategy involves a random split of the dataset, which allows instances of the same objects to be included in both training and evaluation datasets. On the other hand, the OW strategy partitions the dataset by object instances to prevent any overlap between objects in the training and test sets. These two approaches evaluate the network&#x2019;s generalization to novel positions of seen objects (IW) and its adaptability to entirely unseen objects (OW). Training was performed using the Adam optimizer with an initial learning rate of 0.01, a batch size of 16, and for 100 epochs. We employed a multi-step learning rate scheduler that reduced the rate by a factor of 0.1 every five epochs to enhance training efficiency and stability.</p>
<p>For the preprocessing of image input, we mainly adjust the RGB image to a size of 224 &#x00D7; 224, normalize the depth map to the range of [0, 1], and use a cross-bilateral filter for restoration. For the model parameters, we set the initial number of channels to 32, the number of cells to 4 normal cells &#x002B; 2 attention cells &#x002B; 2 reduction cells, the batch size to 16, and use Adam as the optimizer. Here, the evaluation metrics we use include mean Intersection over Union (mIoU), grasping accuracy, detection speed (fps), and model size (MB). All contrast models (GG-CNN, GR-ConvNet, SE-ResUNet, DSNet, etc.) were retrained under the same data partitioning and input resolution for fair comparison.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Dataset</title>
<p><xref ref-type="table" rid="table-1">Table 1</xref> presents the two datasets utilized for the training and evaluation of the proposed model. The first dataset is the Cornell Grasping dataset [<xref ref-type="bibr" rid="ref-49">49</xref>], which is renowned for its widespread utilization, comprising 885 RGB-D images and 240 distinct objects. The second is the recently released Jacquard Grasping Dataset [<xref ref-type="bibr" rid="ref-50">50</xref>], which includes 54 K RGB-D images and 11 K objects.</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>Robot grasping datasets</title>
</caption>
<table>
<colgroup>
<col align="center" width="25mm"/>
<col align="center" width="25mm"/>
<col align="center" width="24mm"/>
<col align="center" width="25mm"/> </colgroup>
<thead>
<tr>
<th>Dataset</th>
<th>Objects</th>
<th>Images</th>
<th>Grasps</th>
</tr>
</thead>
<tbody>
<tr>
<td>Cornell</td>
<td>240</td>
<td>885</td>
<td>8019</td>
</tr>
<tr>
<td>Jacquard</td>
<td>11 K</td>
<td>54 K</td>
<td>1.1 M</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To alleviate overfitting on the smaller Cornell dataset, we applied data augmentation, including random rotation, random scaling, and random cropping. For the Jacquard dataset, standard normalization is adopted.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Metrics</title>
<p>The test model in this paper is evaluated using the following metrics: Intersection over Union (IoU), detection speed, and the number of parameters. The definition of IoU is given in <xref ref-type="disp-formula" rid="eqn-6">Eq. (6)</xref>:
<disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:mi>I</mml:mi><mml:mi>O</mml:mi><mml:mi>U</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>Y</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2229;</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>Y</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:mo>&#x222A;</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>Y</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow></mml:mfrac></mml:math></disp-formula></p>
<p>We denote the grasp generated by the network as <inline-formula id="ieqn-15"><mml:math id="mml-ieqn-15"><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and the ground-truth grasp as <inline-formula id="ieqn-16"><mml:math id="mml-ieqn-16"><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>Y</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. To provide a fair comparison, model performance is assessed using the rectangular metric proposed by Jiang et al. [<xref ref-type="bibr" rid="ref-49">49</xref>], where a detection is regarded as valid when both specified conditions are met:
<list list-type="simple">
<list-item><label>(1)</label>
<p>The angular difference between the grasp predicted by the network and the ground-truth grasp must be less than 30&#x00B0;;</p></list-item>
<list-item><label>(2)</label>
<p>The IoU between the predicted and ground-truth bounding boxes must exceed 25%.</p></list-item>
</list></p>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Loss Function</title>
<p>For loss calculation, this study uses the Smooth L1 Loss, which is also termed Huber loss, to quantify the disparity between the predicted confidence in heat map generation and the confidence in the ground truth. The following expression defines the loss function, where <inline-formula id="ieqn-17"><mml:math id="mml-ieqn-17"><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> signifies the grasp prediction generated by the network, and <inline-formula id="ieqn-18"><mml:math id="mml-ieqn-18"><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>Y</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> corresponds to the true grasp.
<disp-formula id="eqn-7"><label>(7)</label><mml:math id="mml-eqn-7" display="block"><mml:mi>L</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>Y</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>n</mml:mi></mml:mfrac><mml:mover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:mover><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>Z</mml:mi></mml:mrow></mml:msub></mml:math></disp-formula>
<disp-formula id="ueqn-10"><mml:math id="mml-ueqn-10" display="block"><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>Z</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="left left" rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:mn>0.5</mml:mn><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>Y</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>Y</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mo>&#x003C;</mml:mo><mml:mn>1</mml:mn></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>Y</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>0.5</mml:mn><mml:mo>,</mml:mo><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>w</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi></mml:mtd></mml:mtr></mml:mtable><mml:mo fence="true" stretchy="true" symmetric="true"></mml:mo></mml:mrow></mml:math></disp-formula></p>
</sec>
<sec id="s4_5">
<label>4.5</label>
<title>Experimental Results</title>
<sec id="s4_5_1">
<label>4.5.1</label>
<title>Results of Neural Architecture Search</title>
<p>For the search phase, the total number of search iterations was set to 200, and for each search, we selected one Cell from the normal cell space, one from the attention cell space, and one from the reduction cell space, using the cell stacking method shown in <xref ref-type="fig" rid="fig-3">Fig. 3</xref>. To expedite the search and enhance efficiency, the effectiveness probability for each cell was evaluated. The effectiveness probability refers to the frequency with which a cell appears in the best-performing networks. Here, we calculated the usage frequency of each Cell from the top 50 best-performing networks and used this to determine their relative usage proportions in all generated networks. This approach infers the selection probability of each cell; the resulting probabilities are presented in <xref ref-type="table" rid="table-2">Table 2</xref>.</p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>Effectiveness probabilities of cells in different search spaces</title>
</caption>
<table>
<colgroup>
<col align="center" width="40mm"/>
<col align="center" width="13mm"/>
<col align="center" width="11mm"/>
<col align="center" width="9mm"/>
<col align="center" width="11mm"/>
<col align="center" width="10mm"/>
<col align="center" width="10mm"/>
<col align="center" width="10mm"/>
<col align="center" width="10mm"/> </colgroup>
<tbody>
<tr>
<td rowspan="2"><bold>Normal Search Space-1</bold></td>
<td><bold>A</bold></td>
<td><bold>B</bold></td>
<td><bold>C</bold></td>
<td><bold>D</bold></td>
<td><bold>E</bold></td>
<td><bold>F</bold></td>
<td><bold>G</bold></td>
<td><bold>H</bold></td>
</tr>
<tr>
<td>0.161</td>
<td>0.181</td>
<td>0.167</td>
<td>0.216</td>
<td>0.205</td>
<td>0.183</td>
<td>0.124</td>
<td>0.375</td>
</tr>
<tr>
<td rowspan="2"><bold>Normal Search Space-2</bold></td>
<td><bold>A</bold></td>
<td><bold>B</bold></td>
<td><bold>C</bold></td>
<td><bold>D</bold></td>
<td><bold>E</bold></td>
<td><bold>F</bold></td>
<td><bold>G</bold></td>
<td><bold>H</bold></td>
</tr>
<tr>
<td>0.241</td>
<td>0.085</td>
<td>0.176</td>
<td>0.210</td>
<td>0.182</td>
<td>0.174</td>
<td>0.202</td>
<td>0.227</td>
</tr>
<tr>
<td rowspan="2"><bold>Attention Search Space</bold></td>
<td><bold>SE</bold></td>
<td><bold>CBAM</bold></td>
<td><bold>EMA</bold></td>
<td><bold>TMGSA</bold></td>
<td><bold>SC2A</bold></td>
<td><bold>TKSA</bold></td>
<td><bold>&#x2013;</bold></td>
<td><bold>&#x2013;</bold></td>
</tr>
<tr>
<td>0.167</td>
<td>0.192</td>
<td>0.075</td>
<td>0.053</td>
<td>0.054</td>
<td>0.108</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
</tr>
<tr>
<td rowspan="2"><bold>Reduce Search Space</bold></td>
<td><bold>I</bold></td>
<td><bold>J</bold></td>
<td><bold>K</bold></td>
<td><bold>L</bold></td>
<td><bold>M</bold></td>
<td><bold>N</bold></td>
<td><bold>O</bold></td>
<td><bold>P</bold></td>
</tr>
<tr>
<td>0.147</td>
<td>0.194</td>
<td>0.316</td>
<td>0.255</td>
<td>0.178</td>
<td>0.045</td>
<td>0.212</td>
<td>0.174</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Specifically, the confidence scores for cells H, D, E, and B in the first two layers of Normal cells are 0.375, 0.216, 0.205, and 0.181, respectively. In the last two layers, cells A, H, D, and G demonstrate robust performance, achieving probabilities of 0.241, 0.227, 0.210, and 0.202, respectively. For the Attention cell, CBAM, SE, and TKSA modules exhibited effectiveness probabilities of 0.192, 0.167, and 0.108, highlighting their strong information selection flexibility. For the Reduce cell, cells K, L, O, and J show effectiveness probabilities of 0.316, 0.255, 0.212, and 0.194, demonstrating their importance in the network. Using these data, we selected the top 4 cells with the highest effectiveness probabilities in the Normal and Reduce cell search spaces, and the top 3 in the Attention cell search space, to form the optimized search space. Then, the 5 structures with the best overall performance were selected based on their effectiveness probabilities, and their performance was compared in detail. The comparison of performance is provided in <xref ref-type="table" rid="table-3">Table 3</xref>.</p>
<table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>Performance comparison of the top five searched network structures</title>
</caption>
<table>
<colgroup>
<col align="center" width="7mm"/>
<col align="center" width="55mm"/>
<col align="center" width="23mm"/>
<col align="center" width="28mm"/> </colgroup>
<thead>
<tr>
<th>No.</th>
<th>Network structure composition</th>
<th>Accuracy (%)</th>
<th>Params (M)</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>D, B, SE, K, H, G, CBAM, O</td>
<td>97.2</td>
<td>0.44</td>
</tr>
<tr>
<td>2</td>
<td>B, H, SE, L, A, H, SE, K</td>
<td>98.3</td>
<td>0.41</td>
</tr>
<tr>
<td>3</td>
<td>H, H, CBAM, J, H, D, SE, L</td>
<td>98.6</td>
<td>0.68</td>
</tr>
<tr>
<td>4</td>
<td>E, D, TKSA, K, D, A, SE, K</td>
<td>96.5</td>
<td>0.55</td>
</tr>
<tr>
<td>5</td>
<td>H, D, TKSA, K, A, H, CBAM, L</td>
<td>96.6</td>
<td>0.32</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><xref ref-type="table" rid="table-3">Table 3</xref> shows that the 3rd structure achieves the highest accuracy of 98.6%; however, it also exhibits the largest parameter count at 0.68 M. Although the 5th structure has the smallest number of parameters, its accuracy does not reach the optimal level. In contrast, the 2nd structure achieves an accuracy close to the best performance while maintaining a relatively small parameter size.</p>

</sec>
<sec id="s4_5_2">
<label>4.5.2</label>
<title>Results on the Cornell and Jacquard Dataset</title>
<p>All experimental results reported in this section are the mean &#x00B1; standard deviation (Mean &#x00B1; Std.) of three independent training and testing runs with different random seeds. This provides a measure of the variability and stability of our model&#x2019;s performance.</p>
<p>Utilizing the sequence (B, H, SE, L, A, H, SE, K), we establish a detection network for capturing poses of the target. The efficacy of this network is assessed using the Cornell and Jacquard datasets. The feasibility of the proposed method was demonstrated by assessing the performance of the network across different object types and input modalities. The input modalities included unimodal inputs, such as depth (D) only and RGB only, as well as multimodal inputs, namely RGB-D. As indicated in <xref ref-type="table" rid="table-1">Table 1</xref>, the Cornell dataset comprises substantially fewer samples than the Jacquard dataset. To mitigate potential overfitting, we applied data augmentation strategies on the Cornell dataset, including random rotation, scaling, and cropping. We used a cross-validation approach to comprehensively assess the model&#x2019;s validity on the Cornell dataset. This approach divides the dataset into different subsets to enable repeated training and testing across these subsets.</p>

<p><xref ref-type="table" rid="table-4">Table 4</xref> presents the performance comparison of our system with other robotic grasp detection methods on the Cornell dataset under various input modalities. When using RGB-D inputs, our system achieved an image-wise detection accuracy of 99.3% and an object-wise detection accuracy of 97.8%, outperforming all other grasp detection methods listed in <xref ref-type="table" rid="table-4">Table 4</xref>. Furthermore, it achieved a detection speed of 84 fps with only 0.41 M parameters, surpassing most existing grasp detection approaches. As shown in <xref ref-type="table" rid="table-4">Tables 4</xref> and <xref ref-type="table" rid="table-5">5</xref>, our search network demonstrates superior performance with multimodal data compared to unimodal data. <xref ref-type="fig" rid="fig-6">Figs. 6</xref> and <xref ref-type="fig" rid="fig-7">7</xref> illustrate the qualitative results on the Cornell and Jacquard datasets, respectively.</p>
<table-wrap id="table-4">
<label>Table 4</label>
<caption>
<title>Accuracy on the Cornell dataset (Mean &#x00B1; Std., %)</title>
</caption>
<table>
<colgroup>
<col align="center" width="40mm"/>
<col align="center" width="25mm"/>
<col align="center" width="17mm"/>
<col align="center" width="16mm"/>
<col align="center" width="19mm"/> </colgroup>
<thead>
<tr>
<th align="center" rowspan="2">Algorithm</th>
<th colspan="2">Accuracy (%)</th>
<th rowspan="2">Speed (fps)</th>
<th rowspan="2">Parameter (M)</th>
</tr>
<tr>
<th>IW</th>
<th>OW</th>
</tr>
</thead>
<tbody>
<tr>
<td>Fast Search [<xref ref-type="bibr" rid="ref-49">49</xref>]</td>
<td>60.5</td>
<td>58.3</td>
<td>5000</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>AlexNet [<xref ref-type="bibr" rid="ref-48">48</xref>]</td>
<td>88.0 &#x00B1; 0.1</td>
<td>87.1 &#x00B1; 0.3</td>
<td>76</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>ResNet-50 &#x00D7; 2 [<xref ref-type="bibr" rid="ref-52">52</xref>]</td>
<td>89.2</td>
<td>88.9</td>
<td>103</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>GG-CNN [<xref ref-type="bibr" rid="ref-30">30</xref>]</td>
<td>73.0</td>
<td>69.0</td>
<td>19</td>
<td>0.07</td>
</tr>
<tr>
<td>ZF-Net [<xref ref-type="bibr" rid="ref-53">53</xref>]</td>
<td>93.2 &#x00B1; 0.2</td>
<td>89.1 &#x00B1; 0.3</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>GraspNet [<xref ref-type="bibr" rid="ref-54">54</xref>]</td>
<td>90.2</td>
<td>90.6</td>
<td>24</td>
<td>3.71</td>
</tr>
<tr>
<td>GRPN [<xref ref-type="bibr" rid="ref-55">55</xref>]</td>
<td>88.7</td>
<td>&#x2013;</td>
<td>200</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>SE-ResUNet [<xref ref-type="bibr" rid="ref-31">31</xref>]</td>
<td>94.1 &#x00B1; 0.1</td>
<td>96.5 &#x00B1; 0.2</td>
<td>&#x2013;</td>
<td>0.84</td>
</tr>
<tr>
<td>DSNet [<xref ref-type="bibr" rid="ref-51">51</xref>]</td>
<td>98.3 &#x00B1; 0.2</td>
<td>97.1 &#x00B1; 0.3</td>
<td>73</td>
<td>61.19</td>
</tr>
<tr>
<td>GraspFormer</td>
<td>98.91 &#x00B1; 0.1</td>
<td>97.3 &#x00B1; 0.2</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>HTC-Grasp</td>
<td>&#x2013;</td>
<td>96.9</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>GR-ConvNet [<xref ref-type="bibr" rid="ref-45">45</xref>]</td>
<td>&#x2013;</td>
<td>96.6 &#x00B1; 0.1</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>TF-Grasp [<xref ref-type="bibr" rid="ref-56">56</xref>]</td>
<td>&#x2013;</td>
<td>96.7</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>NAS-RGD(D)</td>
<td>91.5 &#x00B1; 0.1</td>
<td>93.1 &#x00B1; 0.2</td>
<td rowspan="3">84</td>
<td rowspan="3">0.41</td>
</tr>
<tr>
<td>NAS-RGD(RGB)</td>
<td>97.6 &#x00B1; 0.1</td>
<td>97.4 &#x00B1; 0.2</td>
</tr>
<tr>
<td><bold>NAS-RGD(RGB-D)</bold></td>
<td><bold>99.3 &#x00B1; 0.1</bold></td>
<td><bold>97.8 &#x00B1; 0.2</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-4fn1" fn-type="other">
<p>Note: IW &#x003D; image-wise split; OW &#x003D; object-wise split; fps &#x003D; frames per second; M &#x003D; million; RGB-D &#x003D; RGB &#x002B; depth; D &#x003D; depth only. Bold indicates best result.</p>
</fn>
</table-wrap-foot>
</table-wrap><table-wrap id="table-5">
<label>Table 5</label>
<caption>
<title>Accuracy on the Jacquard dataset</title>
</caption>
<table>
<colgroup>
<col align="center" width="47mm"/>
<col align="center" width="53mm"/> </colgroup>
<thead>
<tr>
<th>Algorithm</th>
<th>Accuracy (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td>Jacquard [<xref ref-type="bibr" rid="ref-50">50</xref>]</td>
<td>74.2</td>
</tr>
<tr>
<td>GG-CNN [<xref ref-type="bibr" rid="ref-30">30</xref>]</td>
<td>84.0</td>
</tr>
<tr>
<td>FGGN, ResNet-101 [<xref ref-type="bibr" rid="ref-57">57</xref>]</td>
<td>91.8</td>
</tr>
<tr>
<td>GR-ConvNet [<xref ref-type="bibr" rid="ref-45">45</xref>]</td>
<td>94.6</td>
</tr>
<tr>
<td>TF-Grasp [<xref ref-type="bibr" rid="ref-56">56</xref>]</td>
<td>94.6</td>
</tr>
<tr>
<td>SE-ResUNet [<xref ref-type="bibr" rid="ref-31">31</xref>]</td>
<td>94.9</td>
</tr>
<tr>
<td>DSNet [<xref ref-type="bibr" rid="ref-51">51</xref>]</td>
<td>95.7</td>
</tr>
<tr>
<td>NAS-RGD (D)</td>
<td>83.8</td>
</tr>
<tr>
<td>NAS-RGD (RGB)</td>
<td>92.3</td>
</tr>
<tr>
<td><bold>NAS-RGD (RGB-D)</bold></td>
<td><bold>96.8</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-5fn1" fn-type="other">
<p>Note: RGB-D &#x003D; RGB &#x002B; depth; D &#x003D; depth only. Bold indicates best result.</p>
</fn>
</table-wrap-foot>
</table-wrap><fig id="fig-6">
<label>Figure 6</label>
<caption>
<title>Qualitative grasp detection results on Cornell dataset. (<bold>a</bold>) Input RGB-D. (<bold>b</bold>) Ground-truth grasp rectangle. (<bold>c</bold>) Predicted grasp (IoU &#x003D; 0.92). (<bold>d</bold>) CAM heatmap overlaid on input. Red regions denote high activation. Best viewed in color and 300 dpi resolution</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73442-fig-6.tif"/>
</fig><fig id="fig-7">
<label>Figure 7</label>
<caption>
<title>Figures (<bold>a</bold>,<bold>b</bold>) display the results on the Jacquard dataset. Left to right: RGB, depth, predicted grasp, CAM overlay</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73442-fig-7.tif"/>
</fig>
<p>A paired t-test was conducted between our method and the strongest baseline (DSNet [<xref ref-type="bibr" rid="ref-51">51</xref>]) on the OW split accuracy over three runs. The result (<italic>p</italic> &#x003C; 0.05) indicates that the performance improvement of our method is statistically significant.</p>
</sec>
<sec id="s4_5_3">
<label>4.5.3</label>
<title>Ablation Studies</title>
<p>To assess the contributions of the RSC module and the hybrid network architecture, we performed a series of ablation experiments. We evaluated the performance on the Cornell dataset under four different conditions: constructing the RSC module using only CNNs, constructing the RSC module using only the Swin Transformer, removing all RSC modules, and retaining the original structure. All experiments were conducted under consistent conditions, using RGB-D inputs. As shown in <xref ref-type="table" rid="table-6">Table 6</xref>, the model exhibited the worst performance when the RSC module was removed. This is because the RSC module leverages the complementary advantages of CNNs and the Swin Transformer, enabling effective extraction of local features while enhancing global contextual correlations, thus improving the model&#x2019;s representation capability. Furthermore, when the RSC module was constructed using only CNNs or only the Swin Transformer, the model performance was moderate. However, when both components were combined, the model achieved the best performance.</p>

</sec>
<sec id="s4_5_4">
<label>4.5.4</label>
<title>Real World Grasping Experiments</title>
<p>Beyond attaining high performance on two benchmark datasets, we further confirmed the efficacy of our system through real-world robotic grasping experiments. In the experiments, we employed a UR5 robotic arm and adopted an &#x201C;eye-to-hand&#x201D; configuration. The estimated grasping poses, including orientation information and grasp width, were transmitted to the robotic control center. Upon receiving the commands, the robotic arm first moved to a position 25 cm above the target object with the two-finger gripper fully open. It then adjusted the gripper&#x2019;s orientation according to the estimated grasp pose, gradually approached the object, enclosed it within the gripper&#x2019;s range, and progressively closed the gripper to complete the grasp. We conducted tests using 10 target objects and 3 distractor objects, each placed in 10 different positions and orientations. A total of 100 grasp attempts were made on the target objects, resulting in 96 successful grasps, corresponding to an accuracy of 96.0%. For the distractor objects, 30 grasp attempts were made with 28 successes, achieving an accuracy of 93.3%. <xref ref-type="table" rid="table-7">Table 7</xref> presents the comparative results between our method and other deep learning-based approaches in robotic grasping.</p>
<table-wrap id="table-6">
<label>Table 6</label>
<caption>
<title>Accuracy of ablation studies on the Cornell dataset</title>
</caption>
<table>
<colgroup>
<col align="center" width="65mm"/>
<col align="center" width="28mm"/>
<col align="center" width="28mm"/> </colgroup>
<thead>
<tr>
<th align="center" rowspan="2">Ablation Network</th>
<th colspan="2">Accuracy (%)</th>
</tr>
<tr>
<th>IW</th>
<th>OW</th>
</tr>
</thead>
<tbody>
<tr>
<td>RSC Block (Only CNN)</td>
<td>92.5</td>
<td>90.8</td>
</tr>
<tr>
<td>RSC Block (Only Swin Transformer)</td>
<td>95.4</td>
<td>93.7</td>
</tr>
<tr>
<td>Without RSC Block</td>
<td>82.8</td>
<td>81.5</td>
</tr>
<tr>
<td>Ours</td>
<td>98.6</td>
<td>95.4</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Furthermore, to assess the system&#x2019;s generalization, we performed grasping tests on previously unseen objects. In this study, five categories of low-stiffness objects were selected, including sponges, bubbles, tissues, plastic boxes, and paper cores. These objects exhibit deformable characteristics, maintaining their shape only within a specific range of gripping force; exceeding a certain threshold results in deformation and eventual damage. Grasping tests were conducted at 10 different positions and orientations. The experimental results are summarized in <xref ref-type="table" rid="table-8">Table 8</xref>. The standardized comparison in <xref ref-type="table" rid="table-7">Table 7</xref> demonstrates that our method achieves the highest success rate on target objects (96.0%) among all compared methods, while also maintaining a high success rate on adversarial objects (93.3%). This validates the robustness and practical superiority of our searched model in real-world physical grasping scenarios. <xref ref-type="fig" rid="fig-8">Fig. 8</xref> illustrates the scenarios of the UR5 robotic arm grasping low-stiffness objects, and <xref ref-type="fig" rid="fig-9">Fig. 9</xref> presents the confusion matrix for the five types of low-stiffness objects.</p>
<table-wrap id="table-7">
<label>Table 7</label>
<caption>
<title>Real world results of grasping experiments</title>
</caption>
<table>
<colgroup>
<col align="center" width="35mm"/>
<col align="center" width="45mm"/>
<col align="center" width="65mm"/> </colgroup>
<thead>
<tr>
<th>Method</th>
<th>Target objects accuracy (%)</th>
<th>Adversarial objects accuracy (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td>SAE [<xref ref-type="bibr" rid="ref-22">22</xref>]</td>
<td>89.0</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>GG-CNN [<xref ref-type="bibr" rid="ref-30">30</xref>]</td>
<td>83.5</td>
<td>83.8</td>
</tr>
<tr>
<td>GR-ConvNet [<xref ref-type="bibr" rid="ref-45">45</xref>]</td>
<td>86.0</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>SE-ResUNet [<xref ref-type="bibr" rid="ref-31">31</xref>]</td>
<td>94.5</td>
<td>92.0</td>
</tr>
<tr>
<td>DSNet [<xref ref-type="bibr" rid="ref-51">51</xref>]</td>
<td>93.5</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>Ours</td>
<td>96.0</td>
<td>93.3</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-7fn1" fn-type="other">
<p>Note: To ensure a fair comparison, the results for all methods have been standardized and represent the grasp success rate percentage. The data for comparison methods are derived from their original publications (e.g., SE-ResUNet [<xref ref-type="bibr" rid="ref-31">31</xref>] reported 189/200 and 92/100, which equate to 94.5% and 92.0%, respectively). Our results are based on 100 attempts for target objects and 30 for adversarial objects.</p>
</fn>
</table-wrap-foot>
</table-wrap><fig id="fig-8">
<label>Figure 8</label>
<caption>
<title>UR5 robotic arm grasping scenarios with low-stiffness objects. Left to right: RGB, depth, predicted grasp, CAM overlay</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73442-fig-8.tif"/>
</fig><fig id="fig-9">
<label>Figure 9</label>
<caption>
<title>Confusion Matrix of the Five Types of Low-Stiffness Objects. Cell values &#x003D; success count/total trials (10 per object). Color bar indicates accuracy (%)</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73442-fig-9.tif"/>
</fig>
</sec>
<sec id="s4_5_5">
<label>4.5.5</label>
<title>Interpretability Analysis</title>
<p>As previously stated, our search model demonstrates cutting-edge performance in accuracy and parametric efficiency, which is substantiated by practical robot grasping trials. However, in the field of computer vision, the internal operation of models often resembles a black box, with obscure behavioral logic. To accurately characterize the model&#x2019;s feature representations and gain deeper insights into its internal mechanisms, we employed Class Activation Mapping (CAM) for in-depth analysis. This method intuitively highlights the regions where the model focuses during feature representation by generating heatmaps, thereby reflecting the model&#x2019;s sensitivity to different object classes. Such analysis is crucial for enhancing model credibility, optimizing network architecture, and strengthening the reliability of practical applications. <xref ref-type="fig" rid="fig-10">Fig. 10</xref> shows the CAM results corresponding to the highly effective cells for the five types of low-stiffness objects. Darker red regions in the heatmaps indicate higher activation values, implying greater contributions from those regions to the network&#x2019;s decision-making. It can be observed that the cells H, K, and SE exhibit more precise and concentrated receptive regions, indirectly explaining why these cells have the highest effectiveness probabilities and why the overall model achieves higher accuracy with fewer parameters. In contrast, for one of the low-stiffness objects, the tissue, there is evidence of insufficient feature perception in some cells, which helps explain the relatively lower grasping success rate observed during real-world experiments.</p>
<fig id="fig-10">
<label>Figure 10</label>
<caption>
<title>CAMs of Highly Effective Cells Corresponding to the Five Types of Low-Stiffness Objects. Hot color map: dark-red &#x003D; highest contribution. Dashed circles highlight over- or under-activated regions that correlate with lower success</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73442-fig-10.tif"/>
</fig>
<p>We designate the capture detection network developed using the architecture (B, H, SE, L, A, H, SE, K) as model II and generate its Class Activation Maps (CAMs) across various target categories. As shown in <xref ref-type="fig" rid="fig-11">Fig. 11</xref>, Model II continues to focus on the sensitive regions of various objects, further validating the high effectiveness and credibility of our model.</p>
<fig id="fig-11">
<label>Figure 11</label>
<caption>
<title>CAMs of Model II on multiple object categories</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73442-fig-11.tif"/>
</fig>
</sec>
</sec>
<sec id="s4_6">
<label>4.6</label>
<title>Discussion and Limitation</title>
<p>Although the NAS framework proposed in this paper has achieved excellent performance on the Cornell and Jacquard datasets, it also has certain limitations. For instance, the recently proposed GoalGrasp framework [<xref ref-type="bibr" rid="ref-27">27</xref>] addresses the issue of grasping in partially occluded scenarios by leveraging three-dimensional spatial relationships. However, the method proposed in this paper relies on training data and may fail due to feature loss caused by occlusion.</p>
<p>Furthermore, although the architecture found by the NAS search performs well within the training distribution, the grasping success rate decreases when dealing with new categories that differ significantly from the training data, such as flexible objects and transparent objects; for details, please refer to <xref ref-type="table" rid="table-8">Table 8</xref>. Although the final model is lightweight, the NAS search stage still requires hundreds of training and evaluation runs, taking approximately 48 GPU-hours, which is not conducive to rapid deployment to new scenarios.</p>
<table-wrap id="table-8">
<label>Table 8</label>
<caption>
<title>Grasping results on unseen low-stiffness objects</title>
</caption>
<table>
<colgroup>
<col align="center" width="33mm"/>
<col align="center" width="33mm"/>
<col align="center" width="33mm"/> </colgroup>
<thead>
<tr>
<th>Objects</th>
<th>Successful grasps/Attempts</th>
<th>Accuracy (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td>Sponges</td>
<td>44/50</td>
<td>88.0</td>
</tr>
<tr>
<td>Bubble</td>
<td>43/50</td>
<td>86.0</td>
</tr>
<tr>
<td>Tissue</td>
<td>42/50</td>
<td>84.0</td>
</tr>
<tr>
<td>Plastic Box</td>
<td>45/50</td>
<td>90.0</td>
</tr>
<tr>
<td>Paper Core</td>
<td>44/50</td>
<td>88.0</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In future work, we will attempt to introduce unsupervised or self-supervised pre-training to enhance the generalization ability for unknown objects. At the same time, we will also use fine-grained interpretability tools such as (Attention Rollout, Integrated Gradients) to replace CAM and improve the interpretation accuracy.</p>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Conclusion</title>
<p>In this study, we introduce a NAS-based method for adaptive robotic grasp detection. This method automatically designs a high-accuracy and lightweight robotic grasping detection network, effectively improving the success rate of robotic grasping while significantly reducing time costs. Our searched network achieved testing accuracies of 99.3% and 96.8% on the Cornell and Jacquard public datasets, respectively, with a detection rate of 84 fps and only 0.41 M parameters. By maintaining high accuracy (e.g., 99.3% on the Cornell IW split) with substantially fewer parameters (0.41 M), our method demonstrates a favorable balance between performance and efficiency compared to existing robotic grasping detection methods. We further evaluated our model through real-world grasping experiments using the UR5 robotic arm, demonstrating its validity and robustness. Additionally, we utilize a CAM-based interpretability analysis to reveal the internal working mechanisms of the neural network, greatly enhancing the credibility of our searched model. In future work, we plan to extend and optimize the proposed NAS approach to facilitate more efficient discovery of high-performance network architectures and to apply it to increasingly complex scenarios.</p>
</sec>
</body>
<back>
<ack>
<p>Not applicable.</p>
</ack>
<sec>
<title>Funding Statement</title>
<p>This research was funded by Guangdong Basic and Applied Basic Research Foundation (2023B1515120064) and National Natural Science Foundation of China (62273097).</p>
</sec>
<sec>
<title>Author Contributions</title>
<p>The authors confirm contribution to the paper as follows: Conceptualization, Lu Rong and Manyu Xu; Methodology, Lu Rong, Manyu Xu and Wenbo Zhu; Software, Lu Rong and Zhihao Yang; Validation, Lu Rong, Manyu Xu, and Chao Dong; Formal analysis, Manyu Xu and Yunzhi Zhang; Investigation, Lu Rong and Zhihao Yang; Resources, Wenbo Zhu and Bing Zheng; Data curation, Lu Rong and Zhihao Yang; Writing&#x2014;original draft preparation, Lu Rong and Manyu Xu; Writing&#x2014;review and editing, Wenbo Zhu, Kai Wang, and Bing Zheng; Visualization, Zhihao Yang and Yunzhi Zhang; Supervision, Wenbo Zhu and Bing Zheng; Project administration, Wenbo Zhu and Chao Dong; Funding acquisition, Wenbo Zhu. All authors reviewed the results and approved the final version of the manuscript.</p>
</sec>
<sec sec-type="data-availability">
<title>Availability of Data and Materials</title>
<p>Data available on request from the authors. The Cornell Grasping Dataset used in this study is publicly available at <ext-link ext-link-type="uri" xlink:href="https://www.selectdataset.com/dataset/e0a5b83251e7bc6fdf7bdafd538aa2eb">https://www.selectdataset.com/dataset/e0a5b83251e7bc6fdf7bdafd538aa2eb</ext-link> (accessed on 28 July 2025). The Jacquard Grasping Dataset is publicly available at <ext-link ext-link-type="uri" xlink:href="https://jacquard.liris.cnrs.fr/">https://jacquard.liris.cnrs.fr/</ext-link> (accessed on 28 July 2025). The specific code and model weights generated during this study are available from the corresponding author, Wenbo Zhu, upon reasonable request.</p>
</sec>
<sec>
<title>Ethics Approval</title>
<p>All real-world robotic experiments were conducted in a controlled laboratory environment using commercially available objects. No human or animal subjects were involved, and thus no ethical approval was required.</p>
</sec>
<sec sec-type="COI-statement">
<title>Conflicts of Interest</title>
<p>The authors declare no conflicts of interest to report regarding the present study.</p>
</sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Elsken</surname> <given-names>T</given-names></string-name>, <string-name><surname>Metzen</surname> <given-names>JH</given-names></string-name>, <string-name><surname>Hutter</surname> <given-names>F</given-names></string-name></person-group>. <chapter-title>Neural architecture search</chapter-title>. In: <source>Automated machine learning</source>. <publisher-loc>Berlin/Heidelberg, Germany</publisher-loc>: <publisher-name>Springer</publisher-name>; <year>2019</year>. p. <fpage>63</fpage>&#x2013;<lpage>77</lpage>. doi:<pub-id pub-id-type="doi">10.1007/978-3-030-05318-5_3</pub-id>.</mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Jin</surname> <given-names>C</given-names></string-name>, <string-name><surname>Huang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Wei</surname> <given-names>T</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Neural architecture search based on dual attention mechanism for image classification</article-title>. <source>Math Biosci Eng</source>. <year>2023</year>;<volume>20</volume>(<issue>2</issue>):<fpage>2691</fpage>&#x2013;<lpage>715</lpage>. doi:<pub-id pub-id-type="doi">10.3934/mbe.2023126</pub-id>; <pub-id pub-id-type="pmid">36899553</pub-id></mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Zoph</surname> <given-names>B</given-names></string-name>, <string-name><surname>Le</surname> <given-names>QV</given-names></string-name></person-group>. <article-title>Neural architecture search with reinforcement learning</article-title>. <comment>arXiv:161101578. 2016</comment>.</mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wu</surname> <given-names>J</given-names></string-name>, <string-name><surname>Kuang</surname> <given-names>H</given-names></string-name>, <string-name><surname>Lu</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Lin</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Shi</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>X</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>M-FasterSeg: an efficient semantic segmentation network based on neural architecture search</article-title>. <source>Eng Appl Artif Intell</source>. <year>2022</year>;<volume>113</volume>(<issue>4</issue>):<fpage>104962</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.engappai.2022.104962</pub-id>.</mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Rong</surname> <given-names>X</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>D</given-names></string-name>, <string-name><surname>Hu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Zhu</surname> <given-names>C</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>K</given-names></string-name>, <string-name><surname>Lu</surname> <given-names>J</given-names></string-name></person-group>. <article-title>UL-UNAS: ultra-lightweight U-nets for real-time speech enhancement via network architecture search</article-title>. <comment>arXiv:2503.00340. 2025</comment>.</mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Vafeiadis</surname> <given-names>A</given-names></string-name>, <string-name><surname>van de Waterlaat</surname> <given-names>N</given-names></string-name>, <string-name><surname>Castel</surname> <given-names>C</given-names></string-name>, <string-name><surname>Defraene</surname> <given-names>B</given-names></string-name>, <string-name><surname>Daalderop</surname> <given-names>G</given-names></string-name>, <string-name><surname>Vogel</surname> <given-names>S</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Ultra-low memory speech denoising using quantization-aware neural architecture search</article-title>. In: <conf-name>2024 IEEE 34th International Workshop on Machine Learning for Signal Processing (MLSP 2024); 2024 Sep 22&#x2013;25; London, UK</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2024</year>. p. <fpage>1</fpage>&#x2013;<lpage>6</lpage>. doi:<pub-id pub-id-type="doi">10.1109/mlsp58920.2024.10734824</pub-id>.</mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Lee</surname> <given-names>JH</given-names></string-name>, <string-name><surname>Chang</surname> <given-names>JH</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>JM</given-names></string-name>, <string-name><surname>Moon</surname> <given-names>HG</given-names></string-name></person-group>. <article-title>NAS-TasNet: neural architecture search for time-domain speech separation</article-title>. <source>IEEE Access</source>. <year>2022</year>;<volume>10</volume>:<fpage>56031</fpage>&#x2013;<lpage>43</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ACCESS.2022.3176003</pub-id>.</mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Klyuchnikov</surname> <given-names>N</given-names></string-name>, <string-name><surname>Trofimov</surname> <given-names>I</given-names></string-name>, <string-name><surname>Artemova</surname> <given-names>E</given-names></string-name>, <string-name><surname>Salnikov</surname> <given-names>M</given-names></string-name>, <string-name><surname>Fedorov</surname> <given-names>M</given-names></string-name>, <string-name><surname>Filippov</surname> <given-names>A</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>NAS-bench-NLP: neural architecture search benchmark for natural language processing</article-title>. <source>IEEE Access</source>. <year>2022</year>;<volume>10</volume>(<issue>243</issue>):<fpage>45736</fpage>&#x2013;<lpage>47</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ACCESS.2022.3169897</pub-id>.</mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wan</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>L</given-names></string-name>, <string-name><surname>Yu</surname> <given-names>Z</given-names></string-name></person-group>. <article-title>Dual-cell differentiable architecture search for language modeling</article-title>. <source>J Intell Fuzzy Syst</source>. <year>2021</year>;<volume>41</volume>(<issue>2</issue>):<fpage>3985</fpage>&#x2013;<lpage>92</lpage>. doi:<pub-id pub-id-type="doi">10.3233/jifs-210207</pub-id>.</mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>C</given-names></string-name>, <string-name><surname>Fan</surname> <given-names>X</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>S</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>M</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>D</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>DCNN search and accelerator co-design: improve the adaptability between NAS frameworks and embedded platforms</article-title>. <source>Integration</source>. <year>2022</year>;<volume>87</volume>(<issue>2</issue>):<fpage>147</fpage>&#x2013;<lpage>57</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.vlsi.2022.07.003</pub-id>.</mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Lee</surname> <given-names>J</given-names></string-name>, <string-name><surname>Rhim</surname> <given-names>J</given-names></string-name>, <string-name><surname>Kang</surname> <given-names>D</given-names></string-name>, <string-name><surname>Ha</surname> <given-names>S</given-names></string-name></person-group>. <article-title>SNAS: fast hardware-aware neural architecture search methodology</article-title>. <source>IEEE Trans Comput Aided Des Integr Circuits Syst</source>. <year>2022</year>;<volume>41</volume>(<issue>11</issue>):<fpage>4826</fpage>&#x2013;<lpage>36</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TCAD.2021.3134843</pub-id>.</mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Luo</surname> <given-names>X</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>D</given-names></string-name>, <string-name><surname>Kong</surname> <given-names>H</given-names></string-name>, <string-name><surname>Huai</surname> <given-names>S</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>H</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>W</given-names></string-name></person-group>. <article-title>LightNAS: on lightweight and scalable neural architecture search for embedded platforms</article-title>. <source>IEEE Trans Comput Aided Des Integr Circuits Syst</source>. <year>2023</year>;<volume>42</volume>(<issue>6</issue>):<fpage>1784</fpage>&#x2013;<lpage>97</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TCAD.2022.3208187</pub-id>.</mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>S</given-names></string-name>, <string-name><surname>Mao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>F</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>D</given-names></string-name>, <string-name><surname>Zhong</surname> <given-names>G</given-names></string-name></person-group>. <article-title>DLW-NAS: differentiable light-weight neural architecture search</article-title>. <source>Cogn Comput</source>. <year>2023</year>;<volume>15</volume>(<issue>2</issue>):<fpage>429</fpage>&#x2013;<lpage>39</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s12559-022-10046-y</pub-id>.</mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>XX</given-names></string-name>, <string-name><surname>Chu</surname> <given-names>XX</given-names></string-name>, <string-name><surname>Fan</surname> <given-names>YD</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>ZX</given-names></string-name>, <string-name><surname>Wei</surname> <given-names>XL</given-names></string-name>, <string-name><surname>Yan</surname> <given-names>JC</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>ROME: robustifying memory-efficient NAS via topology disentanglement and gradient accumulation</article-title>. In: <conf-name>2023 IEEE/CVF International Conference on Computer Vision (ICCV 2023); 2023 Oct 1&#x2013;6; Paris, France</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2024</year>. p. <fpage>5916</fpage>&#x2013;<lpage>26</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ICCV51070.2023.00546</pub-id>.</mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Cheng</surname> <given-names>H</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Ma</surname> <given-names>L</given-names></string-name>, <string-name><surname>Wei</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Alsaadi</surname> <given-names>FE</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>X</given-names></string-name></person-group>. <article-title>Differentiable channel pruning guided via attention mechanism: a novel neural network pruning approach</article-title>. <source>Complex Intell Syst</source>. <year>2023</year>;<volume>9</volume>(<issue>5</issue>):<fpage>5611</fpage>&#x2013;<lpage>24</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s40747-023-01022-6</pub-id>.</mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Zhou</surname> <given-names>B</given-names></string-name>, <string-name><surname>Khosla</surname> <given-names>A</given-names></string-name>, <string-name><surname>Lapedriza</surname> <given-names>A</given-names></string-name>, <string-name><surname>Oliva</surname> <given-names>A</given-names></string-name>, <string-name><surname>Torralba</surname> <given-names>A</given-names></string-name></person-group>. <article-title>Learning deep features for discriminative localization</article-title>. In: <conf-name>Proceedings of the 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR); 2016 Jun 27&#x2013;30; Las Vegas, NV, USA</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2016</year>. p. <fpage>2921</fpage>&#x2013;<lpage>9</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR.2016.319</pub-id>.</mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Bicchi</surname> <given-names>A</given-names></string-name>, <string-name><surname>Kumar</surname> <given-names>V</given-names></string-name></person-group>. <article-title>Robotic grasping and contact: a review</article-title>. In: <conf-name>Proceedings of the 2000 IEEE International Conference on Robotics and Automation (ICRA 2000)&#x2014;Millennium Conference; 2000 Apr 24&#x2013;28; San Francisco, CA, USA</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2000</year>. p. <fpage>348</fpage>&#x2013;<lpage>53</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ROBOT.2000.844081</pub-id>.</mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Ferrari</surname> <given-names>C</given-names></string-name>, <string-name><surname>Canny</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Planning optimal grasps</article-title>. In: <conf-name>Proceedings of the 1992 IEEE International Conference on Robotics and Automation (ICRA 1992); 1992 May 12&#x2013;14; Nice, France</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>1992</year>. p. <fpage>2290</fpage>&#x2013;<lpage>5</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ROBOT.1992.219918</pub-id>.</mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Sahbani</surname> <given-names>A</given-names></string-name>, <string-name><surname>El-Khoury</surname> <given-names>S</given-names></string-name>, <string-name><surname>Bidaud</surname> <given-names>P</given-names></string-name></person-group>. <article-title>An overview of 3D object grasp synthesis algorithms</article-title>. <source>Robot Auton Syst</source>. <year>2012</year>;<volume>60</volume>(<issue>3</issue>):<fpage>326</fpage>&#x2013;<lpage>36</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.robot.2011.07.016</pub-id>.</mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Bohg</surname> <given-names>J</given-names></string-name>, <string-name><surname>Morales</surname> <given-names>A</given-names></string-name>, <string-name><surname>Asfour</surname> <given-names>T</given-names></string-name>, <string-name><surname>Kragic</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Data-driven grasp synthesis&#x2014;A survey</article-title>. <source>IEEE Trans Robot</source>. <year>2014</year>;<volume>30</volume>(<issue>2</issue>):<fpage>289</fpage>&#x2013;<lpage>309</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TRO.2013.2289018</pub-id>.</mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Du</surname> <given-names>G</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>K</given-names></string-name>, <string-name><surname>Lian</surname> <given-names>S</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>K</given-names></string-name></person-group>. <article-title>Vision-based robotic grasping from object localization, object pose estimation to grasp estimation for parallel grippers: a review</article-title>. <comment>arXiv:1905.06658. 2019</comment>.</mixed-citation></ref>
<ref id="ref-22"><label>[22]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Lenz</surname> <given-names>I</given-names></string-name>, <string-name><surname>Lee</surname> <given-names>H</given-names></string-name>, <string-name><surname>Saxena</surname> <given-names>A</given-names></string-name></person-group>. <article-title>Deep learning for detecting robotic grasps</article-title>. <source>Int J Robot Res</source>. <year>2015</year>;<volume>34</volume>(<issue>4&#x2013;5</issue>):<fpage>705</fpage>&#x2013;<lpage>24</lpage>. doi:<pub-id pub-id-type="doi">10.1177/0278364914549607</pub-id>.</mixed-citation></ref>
<ref id="ref-23"><label>[23]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Caldera</surname> <given-names>S</given-names></string-name>, <string-name><surname>Rassau</surname> <given-names>A</given-names></string-name>, <string-name><surname>Chai</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Review of deep learning methods in robotic grasp detection</article-title>. <source>Multimodal Technol Interact</source>. <year>2018</year>;<volume>2</volume>(<issue>3</issue>):<fpage>57</fpage>. doi:<pub-id pub-id-type="doi">10.3390/mti2030057</pub-id>.</mixed-citation></ref>
<ref id="ref-24"><label>[24]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>C</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>D</given-names></string-name>, <string-name><surname>Zhu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Mart&#x00ED;n-Mart&#x00ED;n</surname> <given-names>R</given-names></string-name>, <string-name><surname>Lu</surname> <given-names>C</given-names></string-name>, <string-name><surname>Li</surname> <given-names>FF</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>DenseFusion: 6D object pose estimation by iterative dense fusion</article-title>. In: <conf-name>2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2019); 2019 Jun 15&#x2013;20; Long Beach, CA, USA</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2019</year>. p. <fpage>3338</fpage>&#x2013;<lpage>47</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR.2019.00346</pub-id>.</mixed-citation></ref>
<ref id="ref-25"><label>[25]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>He</surname> <given-names>K</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Ren</surname> <given-names>S</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Spatial pyramid pooling in deep convolutional networks for visual recognition</article-title>. <source>IEEE Trans Pattern Anal Mach Intell</source>. <year>2015</year>;<volume>37</volume>(<issue>9</issue>):<fpage>1904</fpage>&#x2013;<lpage>16</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TPAMI.2015.2389824</pub-id>; <pub-id pub-id-type="pmid">26353135</pub-id></mixed-citation></ref>
<ref id="ref-26"><label>[26]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Chen</surname> <given-names>LC</given-names></string-name>, <string-name><surname>Papandreou</surname> <given-names>G</given-names></string-name>, <string-name><surname>Kokkinos</surname> <given-names>I</given-names></string-name>, <string-name><surname>Murphy</surname> <given-names>K</given-names></string-name>, <string-name><surname>Yuille</surname> <given-names>AL</given-names></string-name></person-group>. <article-title>DeepLab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected CRFs</article-title>. <source>IEEE Trans Pattern Anal Mach Intell</source>. <year>2018</year>;<volume>40</volume>(<issue>4</issue>):<fpage>834</fpage>&#x2013;<lpage>48</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TPAMI.2017.2699184</pub-id>; <pub-id pub-id-type="pmid">28463186</pub-id></mixed-citation></ref>
<ref id="ref-27"><label>[27]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Gui</surname> <given-names>S</given-names></string-name>, <string-name><surname>Gui</surname> <given-names>K</given-names></string-name>, <string-name><surname>Luximon</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>GoalGrasp: grasping goals in partially occluded scenarios without grasp training</article-title>. <source>IEEE Trans Ind Inform</source>. <year>2025</year>;<volume>21</volume>(<issue>7</issue>):<fpage>5160</fpage>&#x2013;<lpage>70</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TII.2025.3552653</pub-id>.</mixed-citation></ref>
<ref id="ref-28"><label>[28]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>G&#x00FC;ney</surname> <given-names>E</given-names></string-name>, <string-name><surname>Bay&#x0131;lm&#x0131;&#x015F;</surname> <given-names>C</given-names></string-name>, <string-name><surname>&#x00C7;akar</surname> <given-names>S</given-names></string-name>, <string-name><surname>Erol</surname> <given-names>E</given-names></string-name>, <string-name><surname>Atmaca</surname> <given-names>&#x00D6;</given-names></string-name></person-group>. <article-title>Autonomous control of shore robotic charging systems based on computer vision</article-title>. <source>Expert Syst Appl</source>. <year>2024</year>;<volume>238</volume>(<issue>6</issue>):<fpage>122116</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.eswa.2023.122116</pub-id>.</mixed-citation></ref>
<ref id="ref-29"><label>[29]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Han</surname> <given-names>C</given-names></string-name>, <string-name><surname>Gao</surname> <given-names>G</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Real-time small traffic sign detection with revised faster-RCNN</article-title>. <source>Multimed Tools Appl</source>. <year>2019</year>;<volume>78</volume>(<issue>10</issue>):<fpage>13263</fpage>&#x2013;<lpage>78</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s11042-018-6428-0</pub-id>.</mixed-citation></ref>
<ref id="ref-30"><label>[30]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Morrison</surname> <given-names>D</given-names></string-name>, <string-name><surname>Corke</surname> <given-names>P</given-names></string-name>, <string-name><surname>Leitner</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Learning robust, real-time, reactive robotic grasping</article-title>. <source>Int J Robot Res</source>. <year>2020</year>;<volume>39</volume>(<issue>2&#x2013;3</issue>):<fpage>183</fpage>&#x2013;<lpage>201</lpage>. doi:<pub-id pub-id-type="doi">10.1177/0278364919859066</pub-id>.</mixed-citation></ref>
<ref id="ref-31"><label>[31]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Yu</surname> <given-names>S</given-names></string-name>, <string-name><surname>Zhai</surname> <given-names>DH</given-names></string-name>, <string-name><surname>Xia</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>H</given-names></string-name>, <string-name><surname>Liao</surname> <given-names>J</given-names></string-name></person-group>. <article-title>SE-ResUNet: a novel robotic grasp detection method</article-title>. <source>IEEE Robot Autom Lett</source>. <year>2022</year>;<volume>7</volume>(<issue>2</issue>):<fpage>5238</fpage>&#x2013;<lpage>45</lpage>. doi:<pub-id pub-id-type="doi">10.1109/LRA.2022.3145064</pub-id>.</mixed-citation></ref>
<ref id="ref-32"><label>[32]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>X</given-names></string-name>, <string-name><surname>Deng</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Recent progress in semantic image segmentation</article-title>. <source>Artif Intell Rev</source>. <year>2019</year>;<volume>52</volume>(<issue>2</issue>):<fpage>1089</fpage>&#x2013;<lpage>106</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s10462-018-9641-3</pub-id>.</mixed-citation></ref>
<ref id="ref-33"><label>[33]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Du</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Zhou</surname> <given-names>X</given-names></string-name>, <string-name><surname>Huang</surname> <given-names>T</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>C</given-names></string-name></person-group>. <article-title>A hierarchical evolution of neural architecture search method based on state transition algorithm</article-title>. <source>Int J Mach Learn Cybern</source>. <year>2023</year>;<volume>14</volume>(<issue>8</issue>):<fpage>2723</fpage>&#x2013;<lpage>38</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s13042-023-01794-w</pub-id>.</mixed-citation></ref>
<ref id="ref-34"><label>[34]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Dong</surname> <given-names>J</given-names></string-name>, <string-name><surname>Hou</surname> <given-names>B</given-names></string-name>, <string-name><surname>Feng</surname> <given-names>L</given-names></string-name>, <string-name><surname>Tang</surname> <given-names>H</given-names></string-name>, <string-name><surname>Tan</surname> <given-names>KC</given-names></string-name>, <string-name><surname>Ong</surname> <given-names>YS</given-names></string-name></person-group>. <article-title>A cell-based fast memetic algorithm for automated convolutional neural architecture design</article-title>. <source>IEEE Trans Neural Netw Learn Syst</source>. <year>2023</year>;<volume>34</volume>(<issue>11</issue>):<fpage>9040</fpage>&#x2013;<lpage>53</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TNNLS.2022.3155230</pub-id>; <pub-id pub-id-type="pmid">35298385</pub-id></mixed-citation></ref>
<ref id="ref-35"><label>[35]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>H</given-names></string-name>, <string-name><surname>Simonyan</surname> <given-names>K</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>DARTS: differentiable architecture search</article-title>. <comment>arXiv:1806.09055. 2018</comment>.</mixed-citation></ref>
<ref id="ref-36"><label>[36]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Jagadheesh</surname> <given-names>S</given-names></string-name>, <string-name><surname>Bhanu</surname> <given-names>PV</given-names></string-name>, <string-name><surname>Soumya</surname> <given-names>J</given-names></string-name></person-group>. <article-title>NoC application mapping optimization using reinforcement learning</article-title>. <source>ACM Trans Des Autom Electron Syst</source>. <year>2022</year>;<volume>27</volume>(<issue>6</issue>):<fpage>1</fpage>&#x2013;<lpage>16</lpage>. doi:<pub-id pub-id-type="doi">10.1145/3510381</pub-id>.</mixed-citation></ref>
<ref id="ref-37"><label>[37]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Lin</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Cao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Hu</surname> <given-names>H</given-names></string-name>, <string-name><surname>Wei</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>Z</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Swin Transformer: hierarchical vision transformer using shifted windows</article-title>. In: <conf-name>2021 IEEE/CVF International Conference on Computer Vision (ICCV 2021); 2021 Oct 11&#x2013;17; Montreal, QC, Canada</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2021</year>. p. <fpage>9992</fpage>&#x2013;<lpage>10002</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ICCV48922.2021.00986</pub-id>.</mixed-citation></ref>
<ref id="ref-38"><label>[38]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Lin</surname> <given-names>M</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Yan</surname> <given-names>S</given-names></string-name></person-group>. <article-title>Network in network</article-title>. <comment>arXiv:1312.4400. 2013</comment>.</mixed-citation></ref>
<ref id="ref-39"><label>[39]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>He</surname> <given-names>K</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Ren</surname> <given-names>S</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Deep residual learning for image recognition</article-title>. In: <conf-name>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2016); 2016 Jun 27&#x2013;30; Las Vegas, NV, USA</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2016</year>. p. <fpage>770</fpage>&#x2013;<lpage>8</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id>.</mixed-citation></ref>
<ref id="ref-40"><label>[40]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Hu</surname> <given-names>J</given-names></string-name>, <string-name><surname>Shen</surname> <given-names>L</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>G</given-names></string-name></person-group>. <article-title>Squeeze-and-excitation networks</article-title>. In: <conf-name>2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2018); 2018 Jun 18&#x2013;23; Salt Lake City, UT, USA</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2018</year>. p. <fpage>7132</fpage>&#x2013;<lpage>41</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR.2018.00745</pub-id>.</mixed-citation></ref>
<ref id="ref-41"><label>[41]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Woo</surname> <given-names>S</given-names></string-name>, <string-name><surname>Park</surname> <given-names>J</given-names></string-name>, <string-name><surname>Lee</surname> <given-names>JY</given-names></string-name>, <string-name><surname>Kweon</surname> <given-names>IS</given-names></string-name></person-group>. <chapter-title>CBAM: convolutional block attention module</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Ferrari</surname> <given-names>V</given-names></string-name>, <string-name><surname>Hebert</surname> <given-names>M</given-names></string-name>, <string-name><surname>Sminchisescu</surname> <given-names>C</given-names></string-name>, <string-name><surname>Weiss</surname> <given-names>Y</given-names></string-name></person-group>, editors. <source>Computer Vision&#x2014;ECCV 2018</source>. <publisher-loc>Berlin/Heidelberg, Germany</publisher-loc>: <publisher-name>Springer</publisher-name>; <year>2018</year>. p. <fpage>3</fpage>&#x2013;<lpage>19</lpage>. doi:<pub-id pub-id-type="doi">10.1007/978-3-030-01234-2_1</pub-id>.</mixed-citation></ref>
<ref id="ref-42"><label>[42]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Han</surname> <given-names>K</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>H</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>X</given-names></string-name>, <string-name><surname>Guo</surname> <given-names>J</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>Z</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>A survey on vision transformer</article-title>. <source>IEEE Trans Pattern Anal Mach Intell</source>. <year>2023</year>;<volume>45</volume>(<issue>1</issue>):<fpage>87</fpage>&#x2013;<lpage>110</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TPAMI.2022.3152247</pub-id>; <pub-id pub-id-type="pmid">35180075</pub-id></mixed-citation></ref>
<ref id="ref-43"><label>[43]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Chen</surname> <given-names>X</given-names></string-name>, <string-name><surname>Li</surname> <given-names>H</given-names></string-name>, <string-name><surname>Li</surname> <given-names>M</given-names></string-name>, <string-name><surname>Pan</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Learning a sparse transformer network for effective image deraining</article-title>. In: <conf-name>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2023); 2023 Jun 17&#x2013;24; Vancouver, BC, Canada</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2023</year>. p. <fpage>5896</fpage>&#x2013;<lpage>905</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR52729.2023.00571</pub-id>.</mixed-citation></ref>
<ref id="ref-44"><label>[44]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Ouyang</surname> <given-names>D</given-names></string-name>, <string-name><surname>He</surname> <given-names>S</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>G</given-names></string-name>, <string-name><surname>Luo</surname> <given-names>M</given-names></string-name>, <string-name><surname>Guo</surname> <given-names>H</given-names></string-name>, <string-name><surname>Zhan</surname> <given-names>J</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Efficient multi-scale attention module with cross-spatial learning</article-title>. In: <conf-name>2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2023); 2023 Jun 4&#x2013;10; Rhodes Island, Greece</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2023</year>. p. <fpage>1</fpage>&#x2013;<lpage>5</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ICASSP49357.2023.10096516</pub-id>.</mixed-citation></ref>
<ref id="ref-45"><label>[45]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Kumra</surname> <given-names>S</given-names></string-name>, <string-name><surname>Joshi</surname> <given-names>S</given-names></string-name>, <string-name><surname>Sahin</surname> <given-names>F</given-names></string-name></person-group>. <article-title>Antipodal robotic grasping using generative residual convolutional neural network</article-title>. In: <conf-name>2020 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2020); 2020 Oct 24&#x2013;Nov 24; Las Vegas, NV, USA</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2020</year>. p. <fpage>9626</fpage>&#x2013;<lpage>33</lpage>. doi:<pub-id pub-id-type="doi">10.1109/IROS45743.2020.9340777</pub-id>.</mixed-citation></ref>
<ref id="ref-46"><label>[46]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Szegedy</surname> <given-names>C</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>W</given-names></string-name>, <string-name><surname>Jia</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Sermanet</surname> <given-names>P</given-names></string-name>, <string-name><surname>Reed</surname> <given-names>S</given-names></string-name>, <string-name><surname>Anguelov</surname> <given-names>D</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Going deeper with convolutions</article-title>. In: <conf-name>2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2015); 2015 Jun 7&#x2013;12; Boston, MA, USA</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2015</year>. p. <fpage>1</fpage>&#x2013;<lpage>9</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR.2015.7298594</pub-id>.</mixed-citation></ref>
<ref id="ref-47"><label>[47]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Howard</surname> <given-names>AG</given-names></string-name>, <string-name><surname>Zhu</surname> <given-names>M</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>B</given-names></string-name>, <string-name><surname>Kalenichenko</surname> <given-names>D</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>W</given-names></string-name>, <string-name><surname>Weyand</surname> <given-names>T</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>MobileNets: efficient convolutional neural networks for mobile vision applications</article-title>. <comment>arXiv:1704.04861. 2017</comment>.</mixed-citation></ref>
<ref id="ref-48"><label>[48]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Krizhevsky</surname> <given-names>A</given-names></string-name>, <string-name><surname>Sutskever</surname> <given-names>I</given-names></string-name>, <string-name><surname>Hinton</surname> <given-names>GE</given-names></string-name></person-group>. <article-title>ImageNet classification with deep convolutional neural networks</article-title>. <source>Commun ACM</source>. <year>2017</year>;<volume>60</volume>(<issue>6</issue>):<fpage>84</fpage>&#x2013;<lpage>90</lpage>. doi:<pub-id pub-id-type="doi">10.1145/3065386</pub-id>.</mixed-citation></ref>
<ref id="ref-49"><label>[49]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Jiang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Moseson</surname> <given-names>S</given-names></string-name>, <string-name><surname>Saxena</surname> <given-names>A</given-names></string-name></person-group>. <article-title>Efficient grasping from RGBD images: learning using a new rectangle representation</article-title>. In: <conf-name>2011 IEEE International Conference on Robotics and Automation (ICRA 2011); 2011 May 9&#x2013;13; Shanghai, China</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2011</year>. p. <fpage>3304</fpage>&#x2013;<lpage>11</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ICRA.2011.5980145</pub-id>.</mixed-citation></ref>
<ref id="ref-50"><label>[50]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Depierre</surname> <given-names>A</given-names></string-name>, <string-name><surname>Dellandr&#x00E9;a</surname> <given-names>E</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>L</given-names></string-name></person-group>. <article-title>Jacquard: a large scale dataset for robotic grasp detection</article-title>. In: <conf-name>2018 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2018); 2018 Oct 1&#x2013;5; Madrid, Spain</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2018</year>. p. <fpage>3511</fpage>&#x2013;<lpage>6</lpage>. doi:<pub-id pub-id-type="doi">10.1109/IROS.2018.8593950</pub-id>.</mixed-citation></ref>
<ref id="ref-51"><label>[51]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zhang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Qin</surname> <given-names>X</given-names></string-name>, <string-name><surname>Dong</surname> <given-names>T</given-names></string-name>, <string-name><surname>Li</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Song</surname> <given-names>H</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>Y</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>DSNet: double strand robotic grasp detection network based on cross attention</article-title>. <source>IEEE Robot Autom Lett</source>. <year>2024</year>;<volume>9</volume>(<issue>5</issue>):<fpage>4702</fpage>&#x2013;<lpage>9</lpage>. doi:<pub-id pub-id-type="doi">10.1109/LRA.2024.3381091</pub-id>.</mixed-citation></ref>
<ref id="ref-52"><label>[52]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Kumra</surname> <given-names>S</given-names></string-name>, <string-name><surname>Kanan</surname> <given-names>C</given-names></string-name></person-group>. <article-title>Robotic grasp detection using deep convolutional neural networks</article-title>. In: <conf-name>2017 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2017); 2017 Sep 24&#x2013;28; Vancouver, BC, Canada</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2017</year>. p. <fpage>769</fpage>&#x2013;<lpage>76</lpage>. doi:<pub-id pub-id-type="doi">10.1109/IROS.2017.8202237</pub-id>.</mixed-citation></ref>
<ref id="ref-53"><label>[53]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Guo</surname> <given-names>D</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>F</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>H</given-names></string-name>, <string-name><surname>Kong</surname> <given-names>T</given-names></string-name>, <string-name><surname>Fang</surname> <given-names>B</given-names></string-name>, <string-name><surname>Xi</surname> <given-names>N</given-names></string-name></person-group>. <article-title>A hybrid deep architecture for robotic grasp detection</article-title>. In: <conf-name>2017 IEEE International Conference on Robotics and Automation (ICRA 2017); 2017 May 29&#x2013;Jun 3; Singapore</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2017</year>. p. <fpage>1609</fpage>&#x2013;<lpage>14</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ICRA.2017.7989191</pub-id>.</mixed-citation></ref>
<ref id="ref-54"><label>[54]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Asif</surname> <given-names>U</given-names></string-name>, <string-name><surname>Tang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Harrer</surname> <given-names>S</given-names></string-name></person-group>. <article-title>GraspNet: an efficient convolutional neural network for real-time grasp detection for low-powered devices</article-title>. In: <conf-name>Proceedings of the Twenty-Seventh International Joint Conference on Artificial Intelligence (IJCAI 2018); 2018 Jul 13&#x2013;19; Stockholm, Sweden</conf-name>. <publisher-loc>Menlo Park, CA, USA</publisher-loc>: <publisher-name>International Joint Conferences on Artificial Intelligence Organization (IJCAI Org.)</publisher-name>; <year>2018</year>. p. <fpage>4875</fpage>&#x2013;<lpage>82</lpage>. doi:<pub-id pub-id-type="doi">10.24963/ijcai.2018/677</pub-id>.</mixed-citation></ref>
<ref id="ref-55"><label>[55]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Karaoguz</surname> <given-names>H</given-names></string-name>, <string-name><surname>Jensfelt</surname> <given-names>P</given-names></string-name></person-group>. <article-title>Object detection approach for robot grasp detection</article-title>. In: <conf-name>2019 IEEE International Conference on Robotics and Automation (ICRA 2019); 2019 May 20&#x2013;24; Montreal, QC, Canada</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2019</year>. p. <fpage>4953</fpage>&#x2013;<lpage>9</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ICRA.2019.8793751</pub-id>.</mixed-citation></ref>
<ref id="ref-56"><label>[56]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>S</given-names></string-name>, <string-name><surname>Zhou</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Kan</surname> <given-names>Z</given-names></string-name></person-group>. <article-title>When transformer meets robotic grasping: exploits context for efficient grasp detection</article-title>. <source>IEEE Robot Autom Lett</source>. <year>2022</year>;<volume>7</volume>(<issue>3</issue>):<fpage>8170</fpage>&#x2013;<lpage>7</lpage>. doi:<pub-id pub-id-type="doi">10.1109/LRA.2022.3187261</pub-id>.</mixed-citation></ref>
<ref id="ref-57"><label>[57]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Zhou</surname> <given-names>X</given-names></string-name>, <string-name><surname>Lan</surname> <given-names>X</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>H</given-names></string-name>, <string-name><surname>Tian</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Zheng</surname> <given-names>N</given-names></string-name></person-group>. <article-title>Fully convolutional grasp detection network with oriented anchor box</article-title>. In: <conf-name>2018 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2018); 2018 Oct 1&#x2013;5; Madrid, Spain</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2018</year>. p. <fpage>7223</fpage>&#x2013;<lpage>30</lpage>. doi:<pub-id pub-id-type="doi">10.1109/IROS.2018.8594116</pub-id>.</mixed-citation></ref>
</ref-list>
</back></article>
















