<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMES</journal-id>
<journal-id journal-id-type="nlm-ta">CMES</journal-id>
<journal-id journal-id-type="publisher-id">CMES</journal-id>
<journal-title-group>
<journal-title>Computer Modeling in Engineering &#x0026; Sciences</journal-title>
</journal-title-group>
<issn pub-type="epub">1526-1506</issn>
<issn pub-type="ppub">1526-1492</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">29729</article-id>
<article-id pub-id-type="doi">10.32604/cmes.2023.029729</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Role Dynamic Allocation of Human-Robot Cooperation Based on Reinforcement Learning in an Installation of Curtain Wall</article-title>
<alt-title alt-title-type="left-running-head">Role Dynamic Allocation of Human-Robot Cooperation Based on Reinforcement Learning in an Installation of Curtain Wall</alt-title>
<alt-title alt-title-type="right-running-head">Role Dynamic Allocation of Human-Robot Cooperation Based on Reinforcement Learning in an Installation of Curtain Wall</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<name name-style="western"><surname>Liu</surname><given-names>Zhiguang</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-2" contrib-type="author">
<name name-style="western"><surname>Wang</surname><given-names>Shilin</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-3" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Zhao</surname><given-names>Jian</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><email>Zhaojian_tju@163.com</email></contrib>
<contrib id="author-4" contrib-type="author">
<name name-style="western"><surname>Hao</surname><given-names>Jianhong</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Yu</surname><given-names>Fei</given-names></name><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<aff id="aff-1"><label>1</label><institution>School of Control and Mechanical Engineering, Tianjin Chengjian University</institution>, <addr-line>Tianjin, 300384</addr-line>, <country>China</country></aff>
<aff id="aff-2"><label>2</label><institution>Comprehensive Business Department, CATARC (Tianjin) Automotive Engineering Research Institute Co., Ltd.</institution>, <addr-line>300339</addr-line>, <country>China</country></aff>
<aff id="aff-3"><label>3</label><institution>School of Mechanical Engineering, Hebei University of Technology</institution>, <addr-line>Tianjin, 300130</addr-line>, <country>China</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Author: Jian Zhao. Email: <email>Zhaojian_tju@163.com</email></corresp>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2023</year></pub-date>
<pub-date date-type="pub" publication-format="electronic"><day>22</day><month>9</month><year>2023</year></pub-date>
<volume>138</volume>
<issue>1</issue>
<fpage>473</fpage>
<lpage>487</lpage>
<history>
<date date-type="received"><day>05</day><month>3</month><year>2023</year></date>
<date date-type="accepted"><day>09</day><month>5</month><year>2023</year></date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2024 Liu et al.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Liu et al.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMES_29729.pdf"></self-uri>
<abstract>
<p>A real-time adaptive roles allocation method based on reinforcement learning is proposed to improve human-robot cooperation performance for a curtain wall installation task. This method breaks with the traditional idea that the robot is always regarded as the follower, or that cooperation merely switches the leader and the follower roles. In this paper, a self-learning method is proposed which can dynamically adapt and continuously adjust the initiative weight of the robot according to changes in the task. Firstly, the physical human-robot cooperation model, including the role factor, is built. Then, a reinforcement learning model that can adjust the role factor in real time is established, and a reward model and an action model are designed. The role factor can be adjusted continuously according to the comprehensive performance of the human-robot interaction force and the robot&#x2019;s <italic>Jerk</italic> during the repeated installation. Finally, the roles adjustment rule established above continuously improves the comprehensive performance. Experiments on dynamic roles allocation and on the effect of the performance weighting coefficient on the result were carried out. The results show that the proposed method can realize role adaptation and achieve the dual optimization goal of reducing the sum of the cooperator&#x2019;s force and the robot&#x2019;s <italic>Jerk</italic>.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Human-robot cooperation</kwd>
<kwd>roles allocation</kwd>
<kwd>reinforcement learning</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>Tianjin Education Commission Scientific Research Program</funding-source>
<award-id>2020KJ056</award-id>
</award-group>
<award-group id="awg2">
<funding-source>Tianjin Science and Technology Plan Project</funding-source>
<award-id>22YDTPJC00970</award-id>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1"><label>1</label><title>Introduction</title>
<p>With the progress of research on human-robot cooperation and intelligent robot technology, it has been recognized that tasks can be completed more efficiently and smoothly by endowing the robot with specific initiative [<xref ref-type="bibr" rid="ref-1">1</xref>,<xref ref-type="bibr" rid="ref-2">2</xref>]. Many studies have assumed that the human is the leader and the robot is the follower during cooperation [<xref ref-type="bibr" rid="ref-3">3</xref>]. As auxiliary equipment, the robot can help the human increase or decrease force by collecting interactive signals, reducing the partner&#x2019;s working intensity [<xref ref-type="bibr" rid="ref-4">4</xref>]. Such research mainly focuses on master-slave and follow-up robot control algorithms [<xref ref-type="bibr" rid="ref-5">5</xref>,<xref ref-type="bibr" rid="ref-6">6</xref>]. However, in some practical tasks, both the human and the robot must act as leaders and followers [<xref ref-type="bibr" rid="ref-7">7</xref>,<xref ref-type="bibr" rid="ref-8">8</xref>]. An additional complication is that the roles of leader and follower may need to change during the task. Several researchers have addressed the issue of the different roles of humans and robots in cooperative tasks. For example, Lawitzky et al. [<xref ref-type="bibr" rid="ref-9">9</xref>] have shown that task performance is improved through a higher degree of assistance by the robot in a human-robot object-moving task. Some researchers [<xref ref-type="bibr" rid="ref-10">10</xref>&#x2013;<xref ref-type="bibr" rid="ref-12">12</xref>] have tried to create a continuous function by rapidly switching between two distinct extreme behaviors (leader and follower) to change the cooperative role. In order to develop assistance adaptation schemes, Passenberg et al. presented a force-based criterion for distinguishing between the two scenarios and introduced an approach to optimize the assistance level for each scenario [<xref ref-type="bibr" rid="ref-13">13</xref>]. Based on the observation that human-human interaction does not define a proportion of role allocation in advance, some researchers have studied approaches that allow online adjustment of the dominance distribution between partners depending on the situation [<xref ref-type="bibr" rid="ref-14">14</xref>]. To compare the cooperation performance of a fixed role method with that of an adaptive role switching method, some researchers [<xref ref-type="bibr" rid="ref-15">15</xref>,<xref ref-type="bibr" rid="ref-16">16</xref>] investigated a method for the simultaneous switching of two roles between a robot and a human participant. They proved that the adaptive online role-adjusting method has a higher success rate than the fixed role method.</p>
<p>Among recent related work, literature [<xref ref-type="bibr" rid="ref-17">17</xref>] further develops the dynamic role assignment (RDA) algorithm [<xref ref-type="bibr" rid="ref-15">15</xref>,<xref ref-type="bibr" rid="ref-16">16</xref>] based on the homotopy method. The robot knows the target location and task content and plans its motion trajectory, while the human acts as a task corrector. Specifically, when the robot plays the &#x201C;leader&#x201D; role, it follows a pre-planned trajectory; when the robot&#x2019;s movement track does not meet the task requirements, the human plays the &#x201C;leader&#x201D; role and intervenes in (corrects) the robot&#x2019;s movement. However, the robot&#x2019;s trajectory cannot be planned in tasks with unknown and variable targets, such as the human-robot cooperative curtain wall assembly scene shown in <xref ref-type="fig" rid="fig-1">Fig. 1</xref>. Therefore, the above RDA method is no longer applicable. A three-module framework (HMRDA) consisting of a human-robot cooperative motion target prediction module, a role dynamic assignment module, and a robot motion planning module was designed, and a dynamic role assignment method based on goal prediction and fuzzy reasoning was proposed [<xref ref-type="bibr" rid="ref-18">18</xref>]. According to motion information and prediction information, the robot can adjust its role in human-robot cooperative motion to change the motion trajectory. However, the above HMRDA-based approach only handles the binary problem in which the role is either leader or follower, rather than shifting the role weight gradually toward the leader or the follower. In addition, the premise of changing the role is that the robot can accurately recognize human intention. Compared with the dynamic adjustment of roles, the authors of that article [<xref ref-type="bibr" rid="ref-18">18</xref>] contributed more to the recognition of human intention.</p>
<fig id="fig-1"><label>Figure 1</label><caption><title>The architecture of the relationship among the optimized task, research methods, and the human-robot cooperation</title></caption><graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_29729-fig-1.tif"/></fig>
<p>Reinforcement learning is often an effective method to solve the problem of parameter identification or robot imitation learning in human-robot cooperation [<xref ref-type="bibr" rid="ref-19">19</xref>]. As with the role allocation problem, reinforcement learning is often used to learn model-free strategies in practical robot control. Such online, self-adaptive learning algorithms are applied not only in human-robot cooperation but also to problems with hard-to-identify dynamic parameters and nonlinear control. For example, literature [<xref ref-type="bibr" rid="ref-20">20</xref>] adopts reinforcement learning to construct a control policy inside the multi-dimensional chaotic region, solving the control problem of a higher-order, coupled, 3D under-actuated manipulator with non-parametric uncertainties, control signal delay (input delay), and actuator saturation. Literature [<xref ref-type="bibr" rid="ref-21">21</xref>] proposed a reinforcement learning method called the &#x2018;CPG-actor-critic&#x2019; to control a humanoid robot leg, which can successfully control a system with a large DOF using a small number of actor parameters. In addition, an excessive increase in the input dimensionality of the critic could be avoided. These methods provide ideas for using reinforcement learning to solve the role assignment problem in human-robot cooperation in this paper.</p>
<p>In this paper, we use reinforcement learning to adjust the roles allocation of the human and robot so that the glass curtain wall unit can be installed more efficiently and quickly. First, the physical human-robot cooperation model, including the role factor, is built. Second, a reinforcement learning model which can adjust the role factor in real time is established, and a reward model and an action model are designed. The role factor can be adjusted continuously according to the comprehensive performance of the human-robot interaction force and the robot&#x2019;s <italic>Jerk</italic> during the repeated installation process. Finally, experiments on dynamic role allocation and on the effect of the performance weighting coefficient on the result were carried out. The results show that the proposed method can realize role adaptation and achieve the dual optimization goal of reducing the sum of the cooperator&#x2019;s force and the robot&#x2019;s <italic>Jerk</italic>. Compared with existing role allocation methods, the established role model is not limited to a leader and a follower but provides a more precise division of roles, which is more suitable for tasks in which the boundary between the leader and the follower is blurred. In addition, the reinforcement learning algorithm is used to learn the changing rules of the role. The intelligence of the robot is enhanced by imitating the way human beings use incentives and training to improve intelligence, in order to explore and solve the problems caused by the robot&#x2019;s limited intelligence in human-robot cooperation, such as low cooperation efficiency, heavy labor intensity for operators, and application difficulties. The main contributions of this work are as follows: (1) The role adjustment model and comprehensive performance model of human-robot cooperation are established. (2) A dynamic role assignment method based on reinforcement learning is proposed. The robot can adjust its role in the human-robot cooperative movement in real time according to changes in the cooperative task, giving full play to the advantages of both humans and robots.</p>
<p>An architecture is created to visualize the relationship among the optimized task, traditional control approaches, reinforcement learning, and human-robot cooperation, as shown in <xref ref-type="fig" rid="fig-1">Fig. 1</xref>.</p>
<p><xref ref-type="fig" rid="fig-1">Fig. 1</xref> shows that this paper transforms the role allocation problem in human-robot cooperation into a dynamic adaptive optimization problem. It compares the differences between traditional control methods and the proposed methods in robot role allocation.</p>
</sec>
<sec id="s2"><label>2</label><title>Problem Statement</title>
<sec id="s2_1"><label>2.1</label><title>Task Description</title>
<p>In this paper, we consider a scene in which a human collaborates with a robot to complete the installation of a glass curtain wall, as shown in <xref ref-type="fig" rid="fig-2">Fig. 2</xref>. The process has two steps: (1) the curtain wall should be moved rapidly to near the preinstallation location in the low-restricted area; (2) the curtain wall should be precisely installed into the frame in the high-restricted area. In step (1), there is a larger space for movement; the efficiency of the movement is the main concern, and the effort from the human should be minimized. In step (2), however, the task is a precise installation, and the stability of the curtain wall at the robot end-effector under multi-force action is of greater concern than the human&#x2019;s effort.</p>
<fig id="fig-2"><label>Figure 2</label><caption><title>Human-robot cooperation system schematic diagram of curtain wall installation</title></caption><graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_29729-fig-2.tif"/><attrib>Note: The knowledge of the high-restricted and low-restricted areas of the robot is not given, and the high-restricted and low-restricted have no apparent boundaries.</attrib></fig>
</sec>
<sec id="s2_2"><label>2.2</label><title>Human-Robot Cooperation Model and Evaluation Model</title>
<sec id="s2_2_1"><label>2.2.1</label><title>Cooperation Model</title>
<p>A force-based physical cooperation model is built and discussed. The application scenario involves one human and one robot performing a lifting task, as shown in <xref ref-type="fig" rid="fig-3">Fig. 3</xref>. In this paper, a 1-DOF case is used as the research model. However, the definitions may also be valid for more DOFs and partners.</p>
<fig id="fig-3"><label>Figure 3</label><caption><title>Cooperative model based on force interaction in a lifting task</title></caption><graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_29729-fig-3.tif"/></fig>
<p>In <xref ref-type="fig" rid="fig-3">Fig. 3</xref>, <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> respectively represent the interaction force from the human and the robot; <italic>x</italic> is the object&#x2019;s displacement; <italic>m</italic> is the object&#x2019;s mass. The mathematical model acting on an object by two agents can be described as follow based on Newton&#x2019;s Second Law:
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>m</mml:mi><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo>&#x00A8;</mml:mo></mml:mover></mml:mrow></mml:math></disp-formula>where, <inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo>&#x00A8;</mml:mo></mml:mover></mml:mrow></mml:math></inline-formula> is the acceleration of the object.</p>
</sec>
<sec id="s2_2_2"><label>2.2.2</label><title>Roles Model</title>
<p>Following previous studies [<xref ref-type="bibr" rid="ref-14">14</xref>], the contribution levels of the two agents to moving the object can be expressed as follows:
<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd /><mml:mtd><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd /><mml:mtd><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>where, the <inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> and the <inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> respectively represent the role values of the human and the robot.</p>
</sec>
<sec id="s2_2_3"><label>2.2.3</label><title>Evaluation Model</title>
<p>a) <italic>Force Model of Human</italic></p>
<p>The total energy expended by the collaborator or the sum of the interaction forces from the human is generally used to evaluate the human&#x2019;s effort in the cooperation process [<xref ref-type="bibr" rid="ref-14">14</xref>]. The total energy from the partner to complete the task is challenging to measure directly during the cooperation process, whereas the total force of the collaborator is more easily measured. Therefore, the sum of the partner&#x2019;s forces is used to estimate the partner&#x2019;s effort in this paper. The force-sum model of the human, <italic>THF</italic> (<italic>Total Human Force</italic>), is established in <xref ref-type="disp-formula" rid="eqn-3">Eq. (3)</xref>:
<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:mi>T</mml:mi><mml:mi>H</mml:mi><mml:mi>F</mml:mi><mml:mo>=</mml:mo><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">h</mml:mi></mml:mrow></mml:msub><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo></mml:math></disp-formula>where, <inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represents the current time corresponding to the discrete movement steps, <inline-formula id="ieqn-7"><mml:math id="mml-ieqn-7"><mml:msub><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">h</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represents the cooperative force, and <inline-formula id="ieqn-8"><mml:math id="mml-ieqn-8"><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo></mml:math></inline-formula> represents the norm.</p>
<p>b) <italic>Compliant Model of Robot</italic></p>
<p>In the field of robotics, the <italic>Jerk</italic> is often used to describe the flexibility of a robot. The smaller the <italic>Jerk</italic> is, the smoother the system is. In this paper, the sum of the <italic>Jerk</italic> is used to assess the end-point flexibility of the robot, and the compliant model of the robot, <italic>TJerk</italic> (<italic>Total Jerk</italic>), is established in <xref ref-type="disp-formula" rid="eqn-4">Eq. (4)</xref>:
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:mrow><mml:mtext mathvariant="italic">TJerk</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mrow><mml:mover><mml:mi mathvariant="bold-italic">x</mml:mi><mml:mo>&#x20DB;</mml:mo></mml:mover></mml:mrow><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo></mml:math></disp-formula>where, the <inline-formula id="ieqn-9"><mml:math id="mml-ieqn-9"><mml:mrow><mml:mover><mml:mi mathvariant="bold-italic">x</mml:mi><mml:mo>&#x20DB;</mml:mo></mml:mover></mml:mrow></mml:math></inline-formula> represents the <italic>Jerk</italic> at the end of the cooperative robot.</p>
<p>c) <italic>Comprehensive Evaluation Model</italic></p>
<p>A comprehensive evaluation model that reflects human effort and robot compliance is established to evaluate the performance of human-robot cooperation in this paper, and it is defined as follows:
<disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:mi>O</mml:mi><mml:mi>E</mml:mi><mml:mi>S</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:mi>T</mml:mi><mml:mi>H</mml:mi><mml:mi>F</mml:mi><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:mrow><mml:mtext mathvariant="italic">TJerk</mml:mtext></mml:mrow></mml:math></disp-formula>where, <inline-formula id="ieqn-10"><mml:math id="mml-ieqn-10"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-11"><mml:math id="mml-ieqn-11"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> are performance weighting coefficients, and their values meet the following conditions:
<disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="left left" rowspacing=".2em" columnspacing="1em" displaystyle="false"><mml:mtr><mml:mtd><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mn>0</mml:mn><mml:mo>;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">]</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mn>0</mml:mn><mml:mo>;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">]</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mtd></mml:mtr></mml:mtable><mml:mo fence="true" stretchy="true" symmetric="true"></mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>In <xref ref-type="disp-formula" rid="eqn-6">Eq. (6)</xref>, if the weighting factor is designed as <inline-formula id="ieqn-12"><mml:math id="mml-ieqn-12"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:math></inline-formula>, it indicates that only the human effort is considered in the cooperative performance. On the contrary, if the weighting factor is designed as <inline-formula id="ieqn-13"><mml:math id="mml-ieqn-13"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula>, the single factor of robot compliance is used as the evaluation parameter. However, if the weighting factor is designed as <inline-formula id="ieqn-14"><mml:math id="mml-ieqn-14"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.5</mml:mn><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.5</mml:mn></mml:math></inline-formula>, the human effort and the robot compliance are regarded as equally important. The weighting factor value significantly impacts cooperative performance, which affects the result of role allocation. In this paper, the impact on role allocation is discussed separately under the same and different weighting factor.</p>
</sec>
</sec>
</sec>
<sec id="s3"><label>3</label><title>Dynamic Role Adaptive Allocation Design</title>
<sec id="s3_1"><label>3.1</label><title>Overall Framework</title>
<p>Reinforcement learning is a method that can realize adaptive parameter adjustment online and establish the relationship between actions and uncertain states according to the target. In this paper, this method is used to adjust the role parameter during the process of cooperation. The overall architecture of the method is shown in <xref ref-type="fig" rid="fig-4">Fig. 4</xref>.</p>
<fig id="fig-4"><label>Figure 4</label><caption><title>Roles allocation method based on reinforcement learning in human-robot cooperation</title></caption><graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_29729-fig-4.tif"/></fig>
<p>In <xref ref-type="fig" rid="fig-4">Fig. 4</xref>, <inline-formula id="ieqn-15"><mml:math id="mml-ieqn-15"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> is the role factor, which regulates the active and passive relationship between the collaborator and the robot in a cooperative task. The <inline-formula id="ieqn-16"><mml:math id="mml-ieqn-16"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> is dynamically adjusted according to the performance of the comprehensive evaluation model. The robot is controlled by the admittance method [<xref ref-type="bibr" rid="ref-22">22</xref>,<xref ref-type="bibr" rid="ref-23">23</xref>]. The robot admittance control model is shown in <xref ref-type="disp-formula" rid="eqn-7">Eq. (7)</xref>:
<disp-formula id="eqn-7"><label>(7)</label><mml:math id="mml-eqn-7" display="block"><mml:mrow><mml:mo>{</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo>&#x00A8;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo>&#x00A8;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo>&#x02D9;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo>&#x02D9;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mi>K</mml:mi><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub></mml:math></disp-formula>where, <inline-formula id="ieqn-17"><mml:math id="mml-ieqn-17"><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>K</mml:mi><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are the expected mass matrix, damping matrix, and stiffness matrix, respectively. The <inline-formula id="ieqn-18"><mml:math id="mml-ieqn-18"><mml:mrow><mml:mo>{</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo>&#x00A8;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo>&#x02D9;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> are the expected acceleration, velocity, and position of the end of the robot, and the <inline-formula id="ieqn-19"><mml:math id="mml-ieqn-19"><mml:mrow><mml:mo>{</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo>&#x00A8;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo>&#x02D9;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> are the actual acceleration, velocity, and position of the robot&#x2019;s end.</p>
</sec>
<sec id="s3_2"><label>3.2</label><title>Reinforcement Learning</title>
<p>This paper proposes a reinforcement learning model to change roles allocation weight during the installation of glass curtain walls in human-robot cooperation, as shown in <xref ref-type="fig" rid="fig-5">Fig. 5</xref>.</p>
<fig id="fig-5"><label>Figure 5</label><caption><title>Principle diagram of dynamic role allocation based on reinforcement learning method</title></caption><graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_29729-fig-5.tif"/></fig>
<p>Here, the roles allocation value <inline-formula id="ieqn-20"><mml:math id="mml-ieqn-20"><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> is adjusted online according to the system&#x2019;s current state and reward value. The current state <inline-formula id="ieqn-21"><mml:math id="mml-ieqn-21"><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> consists of the motion state of the robot and the collaborator&#x2019;s force, and the reward value is calculated by the designed model described below. A Q-learning model [<xref ref-type="bibr" rid="ref-24">24</xref>] generates the roles allocation algorithm, since no prior strategy or samples are available. The Q-learning algorithm is shown in Algorithm 1.</p>
<fig id="fig-14">
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_29729-fig-14.tif"/>
</fig>
<p>where, <inline-formula id="ieqn-28"><mml:math id="mml-ieqn-28"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> is the learning rate, <italic>s</italic> is the current system state, <inline-formula id="ieqn-29"><mml:math id="mml-ieqn-29"><mml:msup><mml:mi>s</mml:mi><mml:mrow><mml:msup><mml:mi></mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:mrow></mml:msup></mml:math></inline-formula> is the next sampling system state, <italic>a</italic> and <inline-formula id="ieqn-30"><mml:math id="mml-ieqn-30"><mml:msup><mml:mi>a</mml:mi><mml:mrow><mml:msup><mml:mi></mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:mrow></mml:msup></mml:math></inline-formula> is respectively the behavior of the current moment (action) and the next sampling moment, <italic>r</italic> is the return value of the current moment, and <inline-formula id="ieqn-31"><mml:math id="mml-ieqn-31"><mml:mi>&#x03B3;</mml:mi></mml:math></inline-formula> is the discount factor. <italic>&#x03C0;</italic> is the strategy. The machine has to learn, by trial and error in the environment, a &#x201C;strategy&#x201D; &#x03C0;, according to which the action to be performed at state x is obtained: <italic>a</italic> &#x003D; <italic>&#x03C0;</italic> (<italic>s</italic>). In Algorithm 1, the core intermediate variable is <italic>Q</italic>(s, a), which can be referred to as the Q table for short (mainly used for learning deterministic strategy &#x03C0;, namely updating strategy &#x03C0;), representing the expected cumulative reward obtained by the agent choosing action a under state s. Since the dynamic programming method is used and the model is unknown, it is more convenient to use <inline-formula id="ieqn-32"><mml:math id="mml-ieqn-32"><mml:mi>&#x03B3;</mml:mi></mml:math></inline-formula> discount cumulative reward in this paper. In <xref ref-type="fig" rid="fig-14">formula (8)</xref>, <italic>Q</italic>(s, a) is updated incrementally.</p>
<sec id="s3_2_1"><label>3.2.1</label><title>Action</title>
<p>In this paper, we aim to adjust the roles allocation weight <inline-formula id="ieqn-33"><mml:math id="mml-ieqn-33"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> or <inline-formula id="ieqn-34"><mml:math id="mml-ieqn-34"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> using reinforcement learning, so the roles allocation weight <inline-formula id="ieqn-35"><mml:math id="mml-ieqn-35"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> or <inline-formula id="ieqn-36"><mml:math id="mml-ieqn-36"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> should be associated with the action parameter <italic>a</italic> of reinforcement learning. The roles allocation value <inline-formula id="ieqn-37"><mml:math id="mml-ieqn-37"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> is preprocessed by discretization and divided into <italic>m</italic> levels. The action model can be expressed as <inline-formula id="ieqn-38"><mml:math id="mml-ieqn-38"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>a</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x22EF;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula>.</p>
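<p>For example, with <italic>m</italic>&#x2009;&#x003D;&#x2009;6 (the discretization adopted later in Section 4.2), each action index simply selects one candidate role weight:</p>
<preformat preformat-type="code">
# Discretized action set: each action picks a role weight alpha_1.
# The six levels follow Section 4.2; the helper itself is illustrative.
ACTIONS = [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]     # a_1, ..., a_m with m = 6

def action_to_role(a_index):
    return ACTIONS[a_index]                   # alpha_1 = a
</preformat>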
</sec>
<sec id="s3_2_2"><label>3.2.2</label><title>Reward Design</title>
<p>In the physical human-robot cooperation system, the reward in reinforcement learning should be designed to be associated with the comprehensive performance. It is based on minimizing the robot&#x2019;s <italic>Jerk</italic> and the partner&#x2019;s effort. The return value of cooperation performance is described by <xref ref-type="disp-formula" rid="eqn-9">formula (9)</xref> as follows:
<disp-formula id="eqn-9"><label>(9)</label><mml:math id="mml-eqn-9" display="block"><mml:msup><mml:mi>T</mml:mi><mml:mrow><mml:mo>&#x2217;</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mo movablelimits="true" form="prefix">min</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mi>T</mml:mi><mml:mo>}</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo movablelimits="true" form="prefix">min</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>&#x03C4;</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>&#x03C4;</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo>&#x20DB;</mml:mo></mml:mover></mml:mrow><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></disp-formula>where, <inline-formula id="ieqn-39"><mml:math id="mml-ieqn-39"><mml:msup><mml:mi>T</mml:mi><mml:mrow><mml:mo>&#x2217;</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mi>T</mml:mi></mml:math></inline-formula> are non-negative terms, the <inline-formula id="ieqn-40"><mml:math id="mml-ieqn-40"><mml:msub><mml:mi>&#x03C4;</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the duration of discrete motion in the task process, <inline-formula id="ieqn-41"><mml:math id="mml-ieqn-41"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> are the performance weight coefficients. The reward model is designed as follows:</p>
<disp-formula id="eqn-10"><label>(10)</label><mml:math id="mml-eqn-10" display="block"><mml:msub><mml:mi>r</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>&#x03C4;</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x03C4;</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>&#x03C4;</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">h</mml:mi></mml:mrow></mml:msub><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mrow><mml:mover><mml:mi mathvariant="bold-italic">x</mml:mi><mml:mo>&#x20DB;</mml:mo></mml:mover></mml:mrow><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula><p>where, <inline-formula id="ieqn-42"><mml:math id="mml-ieqn-42"><mml:msub><mml:mi mathvariant="bold-italic">&#x03C4;</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">t</mml:mi></mml:mrow></mml:msub><mml:mo mathvariant="bold">,</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">&#x03C4;</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">t</mml:mi><mml:mo>+</mml:mo><mml:mn mathvariant="bold">1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mtext>N</mml:mtext></mml:mrow></mml:math></inline-formula> represents the sample time of the reinforcement learning, <inline-formula id="ieqn-43"><mml:math id="mml-ieqn-43"><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mtext>N</mml:mtext></mml:mrow></mml:math></inline-formula> represents the robot-controlled cycle time, and their relationship is shown as follows:</p>
<disp-formula id="eqn-11"><label>(11)</label><mml:math id="mml-eqn-11" display="block"><mml:msub><mml:mi mathvariant="bold-italic">&#x03C4;</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>k</mml:mi><mml:mi>t</mml:mi></mml:math></disp-formula><p>where, <italic>k</italic> is an integer greater than zero, and it is the asynchronous adjustment coefficient. The function of the asynchronous adjustment coefficient is to enhance the system&#x2019;s robustness by setting the sampling frequency of the reinforcement learning return value less than the robot control frequency.</p>
<p>At the end of each traversal, the total return value of the traversal can be obtained to evaluate the cooperation performance. The sum of return values is shown as follows:
<disp-formula id="eqn-12"><label>(12)</label><mml:math id="mml-eqn-12" display="block"><mml:msub><mml:mi>r</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>&#x03C4;</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">h</mml:mi></mml:mrow></mml:msub><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mrow><mml:mover><mml:mi mathvariant="bold-italic">x</mml:mi><mml:mo>&#x20DB;</mml:mo></mml:mover></mml:mrow><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
</sec>
</sec>
</sec>
<sec id="s4"><label>4</label><title>Experiment</title>
<sec id="s4_1"><label>4.1</label><title>Experimental Setup and Experimental Design</title>
<p>In order to verify the effectiveness of the method proposed in this paper, an experimental platform was designed for human-robot cooperation to complete the curtain wall installation task, as shown in <xref ref-type="fig" rid="fig-6">Fig. 6</xref>.</p>
<fig id="fig-6"><label>Figure 6</label><caption><title>Curtain wall installation experimental platform by physical human-robot cooperation</title></caption><graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_29729-fig-6.tif"/></fig>
<p>In the experiment, to simulate the installation process and avoid the risk of collision in the actual experimental environment, a laser pointer was fixed to the curtain wall, and a laser point was used to indicate the location of the curtain wall. The curtain wall position indicator experimental device is shown in <xref ref-type="fig" rid="fig-7">Fig. 7</xref>.</p>
<fig id="fig-7"><label>Figure 7</label><caption><title>Curtain wall position indicator experimental device</title></caption><graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_29729-fig-7.tif"/></fig>
<p>In the installation process of the curtain wall, the movement track can be divided into the low-restricted area and the high-restricted area. In the low-restricted area, the robot has plenty of room to move, and more attention should be paid to the speed of movement and to reducing the partner&#x2019;s effort than to the movement accuracy of the curtain wall. Contrary to the low-restricted area, in the high-restricted area the movement accuracy of the curtain wall deserves more attention than the speed of movement and the effort of the collaborator.</p>
<p>Here are the steps:
<list list-type="simple">
<list-item><label>1)</label><p>Firstly, the laser point representing the position of the curtain wall was located in the low-restricted area, which was the starting point. The laser point followed the robot&#x2019;s movement, which was operated according to the interaction force measured by a six-dimensional force sensor.</p></list-item>
<list-item><label>2)</label><p>Secondly, the laser point was controlled to move quickly to the high-restricted area entrance.</p></list-item>
<list-item><label>3)</label><p>Thirdly, the operator continued to control the robot toward the target point while ensuring, as much as possible, that the laser point did not collide with the boundary of the high-restricted area.</p></list-item>
<list-item><label>4)</label><p>The curtain wall was considered to have reached the target point when the distance between the laser point and the end position of the drawing board was less than a specific value. Then the robot automatically returned to its original position.</p></list-item>
<list-item><label>5)</label><p>Steps 1)&#x2013;4) were repeated until the variable learned by reinforcement learning leveled off.</p></list-item>
</list></p>
</sec>
<sec id="s4_2"><label>4.2</label><title>Experimental Parameters Design</title>
<p>In order to obtain the continuous state input vector <inline-formula id="ieqn-44"><mml:math id="mml-ieqn-44"><mml:mi>X</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>f</mml:mi><mml:mo>&#x02D9;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo>&#x00A8;</mml:mo></mml:mover></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula>, each variable was divided into five fuzzy sets and evenly distributed over the domain by a triangular membership function, so the total number of states was <inline-formula id="ieqn-45"><mml:math id="mml-ieqn-45"><mml:mi>N</mml:mi><mml:mo>=</mml:mo><mml:msup><mml:mn>5</mml:mn><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mn>625</mml:mn></mml:math></inline-formula>. In this paper, the number of behaviors was set as <italic>m</italic>&#x2009;&#x003D;&#x2009;6, that is, <inline-formula id="ieqn-46"><mml:math id="mml-ieqn-46"><mml:mi>A</mml:mi><mml:mo>=</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mn>5</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mn>6</mml:mn></mml:mrow></mml:msub><mml:mo fence="false" stretchy="false">}</mml:mo><mml:mo>=</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mn>0.1</mml:mn><mml:mo>,</mml:mo><mml:mn>0.2</mml:mn><mml:mo>,</mml:mo><mml:mn>0.4</mml:mn><mml:mo>,</mml:mo><mml:mn>0.6</mml:mn><mml:mo>,</mml:mo><mml:mn>0.8</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></inline-formula>. In other words, the robot and the partner were given six different roles allocation weights. Here, the minimum role of the collaborator was chosen as 0.1 instead of 0 for the following reasons: 1) the operator needed to make decisions that serve as moving guidance information to control the robot in physical human-robot cooperation; 2) it was hazardous to give the robot complete control in actual operation.</p>
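<p>As an illustration of the state construction, each of the four continuous inputs can be mapped to one of five levels and the four level indices combined into a single index over the 5<sup>4</sup>&#x2009;&#x003D;&#x2009;625 states. In the Python sketch below, the triangular membership functions are simplified to a nearest-center assignment, and the variable ranges are assumed to be task-specific inputs:</p>
<preformat preformat-type="code">
import numpy as np

N_LEVELS = 5                            # fuzzy sets per variable (Section 4.2)

def discretize(value, lo, hi):
    """Nearest of five evenly spaced centers on [lo, hi]; a simplification
    of the triangular-membership fuzzification used in the paper."""
    centers = np.linspace(lo, hi, N_LEVELS)
    return int(np.argmin(np.abs(centers - value)))

def state_index(f_h, df_h, x, xdd, bounds):
    """Combine the four variable levels into one of 5**4 = 625 states.
    bounds: dict of (lo, hi) ranges per variable (assumed, task-specific)."""
    levels = [discretize(f_h,  *bounds["f_h"]),
              discretize(df_h, *bounds["df_h"]),
              discretize(x,    *bounds["x"]),
              discretize(xdd,  *bounds["xdd"])]
    s = 0
    for lv in levels:                   # base-5 positional encoding
        s = s * N_LEVELS + lv
    return s
</preformat>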
<p>In this study, the system frequency was set to 1000&#x2005;Hz, and the frequency of the reinforcement learning was set to 100&#x2005;Hz; that is, <italic>k</italic>&#x2009;&#x003D;&#x2009;10 in <xref ref-type="disp-formula" rid="eqn-11">formula (11)</xref>. The learning rate was designed to be <inline-formula id="ieqn-47"><mml:math id="mml-ieqn-47"><mml:mi>&#x03B1;</mml:mi><mml:mo>=</mml:mo><mml:mn>0.95</mml:mn></mml:math></inline-formula> and the discount factor was designed to be <inline-formula id="ieqn-48"><mml:math id="mml-ieqn-48"><mml:mi>&#x03B3;</mml:mi><mml:mo>=</mml:mo><mml:mn>0.9</mml:mn></mml:math></inline-formula> in the Q-learning model (Algorithm 1). The initial role weight of the collaborator was set as <inline-formula id="ieqn-49"><mml:math id="mml-ieqn-49"><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.4</mml:mn></mml:math></inline-formula>.</p>
</sec>
</sec>
<sec id="s5"><label>5</label><title>Experimental Results and Performace Assessment</title>
<sec id="s5_1"><label>5.1</label><title>Dynamic Role Adaptive Allocation Results</title>
<p>In this experiment, the change of cooperative performance as the number of iterations increased was recorded to establish the relationship between human-robot cooperation performance and the roles allocation weight, as shown in <xref ref-type="fig" rid="fig-8">Fig. 8</xref>. The experiment was repeated 30 times by operating the robot from its initial position to its destination. In this process, collisions were avoided whenever possible by observing the position of the laser point. The comprehensive performance model, consisting of the robot&#x2019;s <italic>Jerk</italic> and the human-robot interaction force regarded as equally important (<inline-formula id="ieqn-50"><mml:math id="mml-ieqn-50"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.5</mml:mn></mml:math></inline-formula>), was used to estimate the effect of the role change.</p>
<fig id="fig-8"><label>Figure 8</label><caption><title>Relationship between number of iterations and cooperative performance</title></caption><graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_29729-fig-8.tif"/></fig>
<p>As can be seen from <xref ref-type="fig" rid="fig-8">Fig. 8</xref>, with the increase in the number of iterations, the comprehensive performance value composed of the <italic>Jerk</italic> and the efforts of the cooperator showed a downward trend. When the number of iterations exceeded 25, the comprehensive performance value tended to be stable. This experiment showed that changing the role weight based on reinforcement learning effectively improved the robot&#x2019;s flexibility and reduced the human-robot interaction force.</p>
<p>In order to evaluate the efficiency of task completion as learning progresses, the relationship between the number of iterations and the task completion time was established, as shown in <xref ref-type="fig" rid="fig-9">Fig. 9</xref>.</p>
<fig id="fig-9"><label>Figure 9</label><caption><title>Relationship between number of iterations times and task completion time</title></caption><graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_29729-fig-9.tif"/></fig>
<p>From <xref ref-type="fig" rid="fig-9">Fig. 9</xref>, the task completion time decreases as the number of iterations increases. When the number of iterations exceeded 25, the decline in task completion time was slow and steady. This showed that the roles allocation method based on reinforcement learning was an effective way to improve task completion efficiency.</p>
<p><xref ref-type="fig" rid="fig-10">Fig. 10</xref> shows the mean value <inline-formula id="ieqn-51"><mml:math id="mml-ieqn-51"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> over the last five iterations of the collaborator&#x2019;s role over time <italic>t</italic> based on the reinforcement learning method. Initially, the robot was guided by gradually increasing the partner role <inline-formula id="ieqn-52"><mml:math id="mml-ieqn-52"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, and then the partner role <inline-formula id="ieqn-53"><mml:math id="mml-ieqn-53"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> dropped rapidly once the robot was identified to be in free mode. In the low-restricted area, the robot played a significant role in the movement of the curtain wall, which was beneficial in reducing the labor intensity of the collaborator. However, in the stage of approaching and entering the high-restricted area, the collaborator&#x2019;s role value increased rapidly to ensure the smooth movement of the curtain wall at the robot&#x2019;s end.</p>
<fig id="fig-10"><label>Figure 10</label><caption><title>The mean value of the last five iterations of the collaborator&#x2019;s role over time <italic>t</italic></title></caption><graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_29729-fig-10.tif"/></fig>
<p>The relationship between the partner&#x2019;s force and time <italic>t</italic> for the last five iterations was established, as shown in <xref ref-type="fig" rid="fig-11">Fig. 11</xref>. The trend of the partner&#x2019;s force was similar to that of the collaborator&#x2019;s role weight: it was initially large, then decreased, and then increased again. This pattern is consistent with the relationship between force and role: the greater the weight of the partner&#x2019;s role, the greater the force applied.</p>
<fig id="fig-11"><label>Figure 11</label><caption><title>Relationship between the partner&#x2019;s force and time <italic>t</italic> for the last 5 times</title></caption><graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_29729-fig-11.tif"/></fig>
<p>The cooperation performance of the roles dynamically adjusted by the method proposed in this paper was compared with that of different fixed role weights, as shown in <xref ref-type="fig" rid="fig-12">Fig. 12</xref>. The result verified that continuously adjusting the roles based on the reinforcement learning method was more conducive to human-robot cooperation performance than fixing them.</p>
<fig id="fig-12"><label>Figure 12</label><caption><title>Cooperation performance of fixed and adaptive role based reinforcement learning</title></caption><graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_29729-fig-12.tif"/></fig>
<p>In <xref ref-type="fig" rid="fig-12">Fig. 12</xref>, the fixed roles were designed with six levels: <inline-formula id="ieqn-54"><mml:math id="mml-ieqn-54"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.1</mml:mn><mml:mo>,</mml:mo><mml:mn>0.2</mml:mn><mml:mo>,</mml:mo><mml:mn>0.4</mml:mn><mml:mo>,</mml:mo><mml:mn>0.6</mml:mn><mml:mo>,</mml:mo><mml:mn>0.8</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula>. Compared with the fixed roles, the combined measure of the <italic>Jerk</italic> and the partner&#x2019;s force was lower when using the adaptive role based on the reinforcement learning method, which shows that the method proposed in this paper achieved excellent results.</p>
</sec>
<sec id="s5_2"><label>5.2</label><title>Effect of Performance Weighting Coefficient on the Result</title>
<p>In the above experiment, the robot&#x2019;s <italic>Jerk</italic> and the human-robot interaction force were regarded as equally important (<inline-formula id="ieqn-55"><mml:math id="mml-ieqn-55"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.5</mml:mn><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.5</mml:mn></mml:math></inline-formula>) in <xref ref-type="disp-formula" rid="eqn-9">formula (9)</xref> of the comprehensive performance model. In order to explore the influence of changing the performance weighting coefficient, the <italic>Jerk</italic>, the partner&#x2019;s force, and their overall performance were obtained through experiments based on reinforcement learning, as shown in <xref ref-type="fig" rid="fig-13">Fig. 13</xref>. The weighting of the partner&#x2019;s force was set as <inline-formula id="ieqn-56"><mml:math id="mml-ieqn-56"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.1</mml:mn><mml:mo>,</mml:mo><mml:mn>0.2</mml:mn><mml:mo>,</mml:mo><mml:mn>0.4</mml:mn><mml:mo>,</mml:mo><mml:mn>0.6</mml:mn><mml:mo>,</mml:mo><mml:mn>0.8</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula>.</p>
<fig id="fig-13"><label>Figure 13</label><caption><title>The relationship between overall performance and different performance weighting coefficient of the partner&#x2019; force</title></caption><graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_29729-fig-13.tif"/></fig>
<p>From <xref ref-type="fig" rid="fig-13">Fig. 13</xref>, when <inline-formula id="ieqn-57"><mml:math id="mml-ieqn-57"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:math></inline-formula>, the <italic>Jerk</italic> was the only goal in reinforcement learning. Although the sum of the <italic>Jerk</italic> was minimal, the cooperator&#x2019;s force was considerable, which required more effort from the human. If minimizing the sum of the partner&#x2019;s force was the only consideration (<inline-formula id="ieqn-58"><mml:math id="mml-ieqn-58"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula>), the sum of the cooperator&#x2019;s force was minimal, but the sum of the robot&#x2019;s accelerations became maximal. Excessive acceleration can lead to vibration at the end of the robot, which increases control difficulty and danger. The overall performance value was lowest when the performance weighting coefficient <inline-formula id="ieqn-59"><mml:math id="mml-ieqn-59"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.5</mml:mn></mml:math></inline-formula>; that is, it was appropriate to regard the human-robot interaction force and the robot&#x2019;s <italic>Jerk</italic> as equally important in the comprehensive performance model when estimating the effect of the role change.</p>
</sec>
</sec>
<sec id="s6"><label>6</label><title>Conclusion</title>
<p>In this paper, to address the dynamic role allocation problem in contact-based human-robot cooperation, an online role allocation method based on reinforcement learning is proposed for a curtain wall installation task. First, the physical human-robot cooperation model, including the role factor, is built. Second, a reinforcement learning model, including a reward model and an action model, which can adjust the role factor in real time, is established. The role factor can be adjusted continuously according to the comprehensive performance, consisting of the human-robot interaction force and the robot&#x2019;s <italic>Jerk</italic>, during the repeated installation process. Finally, the comprehensive performance of the human-robot system can be continuously improved by the role adjustment rule established through reinforcement learning. To verify the effectiveness of the proposed method, the dynamic role allocation with respect to human force and <italic>Jerk</italic> and the effect of the performance weighting coefficient were examined experimentally. The experimental results show that the proposed method can realize dynamic adjustment of the human-robot roles and achieve the dual optimization goal of reducing both the sum of the cooperator&#x2019;s force and the robot&#x2019;s <italic>Jerk</italic>. The role assignment method based on reinforcement learning proposed in this paper is of great significance to physical human-robot cooperation. In future work, to further exploit the advantages of the proposed dynamic role assignment algorithm, we will extend the role assignment factor to more degrees of freedom. In addition, more complex reward models and action models with more role values will be built. Meanwhile, the generality of the proposed dynamic role assignment algorithm will be improved so that it can be extended to more practical applications.</p>
</sec>
</body>
<back>
<ack><p>The authors express their gratitude to the editor and referees for their valuable time and efforts on our manuscript.</p></ack>
<sec><title>Funding Statement</title>
<p>The research has been generously supported by Tianjin Education Commission Scientific Research Program (2020KJ056), China, and Tianjin Science and Technology Planning Project (22YDTPJC00970), China. The authors would like to express their sincere appreciation for all support provided.</p></sec>
<sec><title>Author Contributions</title>
<p>Study conception and design: Zhiguang Liu, Jian Zhao; data collection: Shilin Wang; analysis and interpretation of results: Zhiguang Liu, Fei Yu; draft manuscript preparation: Zhiguang Liu, Jianhong Hao. All authors reviewed the results and approved the final version of the manuscript.</p>
<sec sec-type="data-availability"><title>Availability of Data and Materials</title>
<p>The data used in this study are available within the paper.</p>
<sec sec-type="COI-statement"><title>Conflicts of Interest</title>
<p>The authors declare that they have no conflicts of interest to report regarding the present study.</p></sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>1.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Moertl</surname>, <given-names>A.</given-names></string-name>, <string-name><surname>Lawitzky</surname>, <given-names>M.</given-names></string-name>, <string-name><surname>Kucukyilmaz</surname>, <given-names>A.</given-names></string-name>, <string-name><surname>Sezgin</surname>, <given-names>M.</given-names></string-name>, <string-name><surname>Basdogan</surname>, <given-names>C.</given-names></string-name> <etal>et al.</etal></person-group> (<year>2012</year>). <article-title>The role of roles: Physical cooperation between humans and robots</article-title>. <source>International Journal of Robotics Research</source><italic>,</italic> <volume>31</volume><issue>(31)</issue><italic>,</italic> <fpage>1656</fpage>&#x2013;<lpage>1674</lpage>.</mixed-citation></ref>
<ref id="ref-2"><label>2.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Jarrasse</surname>, <given-names>N.</given-names></string-name>, <string-name><surname>Sanguineti</surname>, <given-names>V.</given-names></string-name>, <string-name><surname>Burdet</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2013</year>). <article-title>Slaves no longer: Review on role assignment for human robot joint motor action</article-title>. <source>Adaptive Behavior</source><italic>,</italic> <volume>22</volume><issue>(1)</issue><italic>,</italic> <fpage>70</fpage>&#x2013;<lpage>82</lpage>.</mixed-citation></ref>
<ref id="ref-3"><label>3.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Passenberg</surname>, <given-names>C.</given-names></string-name>, <string-name><surname>Peer</surname>, <given-names>A.</given-names></string-name>, <string-name><surname>Buss</surname>, <given-names>M.</given-names></string-name></person-group> (<year>2010</year>). <article-title>A survey of environment, operator, and task adapted controllers for teleoperation systems</article-title>. <source>Mechatronics</source><italic>,</italic> <volume>20</volume><issue>(7)</issue><italic>,</italic> <fpage>787</fpage>&#x2013;<lpage>801</lpage>.</mixed-citation></ref>
<ref id="ref-4"><label>4.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Tsarouchi</surname>, <given-names>P.</given-names></string-name>, <string-name><surname>Makris</surname>, <given-names>S.</given-names></string-name>, <string-name><surname>Chryssolouris</surname>, <given-names>G.</given-names></string-name></person-group> (<year>2016</year>). <article-title>Human robot interaction review and challenges on task planning and programming</article-title>. <source>International Journal of Computer Integrated Manufacturing</source><italic>,</italic> <volume>29</volume><issue>(8)</issue><italic>,</italic> <fpage>916</fpage>&#x2013;<lpage>931</lpage>.</mixed-citation></ref>
<ref id="ref-5"><label>5.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Dimeas</surname>, <given-names>F.</given-names></string-name>, <string-name><surname>Aspragathos</surname>, <given-names>N.</given-names></string-name></person-group> (<year>2016</year>). <article-title>Online stability in human-robot cooperation with admittance control</article-title>. <source>IEEE Transactions on Haptics</source><italic>,</italic> <volume>9</volume><issue>(2)</issue><italic>,</italic> <fpage>267</fpage>&#x2013;<lpage>278</lpage>; <pub-id pub-id-type="pmid">26780819</pub-id></mixed-citation></ref>
<ref id="ref-6"><label>6.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Lee</surname>, <given-names>H. J.</given-names></string-name>, <string-name><surname>Kim</surname>, <given-names>K. S.</given-names></string-name>, <string-name><surname>Kim</surname>, <given-names>S.</given-names></string-name></person-group> (<year>2021</year>). <article-title>Generalized control framework for exoskeleton robots by interaction force feedback control</article-title>. <source>International Journal of Control, Automation and systems</source><italic>,</italic> <volume>19</volume><italic>,</italic> <fpage>3419</fpage>&#x2013;<lpage>3427</lpage>.</mixed-citation></ref>
<ref id="ref-7"><label>7.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Kheddar</surname>, <given-names>A.</given-names></string-name></person-group> (<year>2011</year>). <article-title>Human robot haptic joint actions is an equal control sharing approach possible</article-title>. <conf-name>International Conference on Human System Interactions</conf-name>, pp. <fpage>268</fpage>&#x2013;<lpage>273</lpage>. <conf-loc>Lisbon, Portugal</conf-loc>.</mixed-citation></ref>
<ref id="ref-8"><label>8.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Jarrasse</surname>, <given-names>N.</given-names></string-name>, <string-name><surname>Charalambous</surname>, <given-names>T.</given-names></string-name>, <string-name><surname>Burdet</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2012</year>). <article-title>A framework to describe, analyze and generate interactive motor behaviors</article-title>. <source>PLoS One</source><italic>,</italic> <volume>7</volume><issue>(11)</issue><italic>,</italic> <fpage>e49945</fpage>; <pub-id pub-id-type="pmid">23226231</pub-id></mixed-citation></ref>
<ref id="ref-9"><label>9.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Lawitzky</surname>, <given-names>M.</given-names></string-name>, <string-name><surname>Mortl</surname>, <given-names>A.</given-names></string-name>, <string-name><surname>Hirche</surname>, <given-names>S.</given-names></string-name></person-group> (<year>2010</year>). <article-title>Load sharing in human robot cooperative manipulation</article-title>. <source>19th International Symposium in Robot and Human Interactive Communication</source>, pp. <fpage>185</fpage>&#x2013;<lpage>191</lpage>. <conf-loc>Viareggio, Italy</conf-loc>.</mixed-citation></ref>
<ref id="ref-10"><label>10.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Li</surname>, <given-names>Y. N.</given-names></string-name>, <string-name><surname>Tee</surname>, <given-names>K. P.</given-names></string-name>, <string-name><surname>Chan</surname>, <given-names>W. L.</given-names></string-name>, <string-name><surname>Yan</surname>, <given-names>R.</given-names></string-name>, <string-name><surname>Chua</surname>, <given-names>Y. W.</given-names></string-name> <etal>et al.</etal></person-group> (<year>2017</year>). <article-title>Continuous role adaptation for human-robot shared control</article-title>. <source>IEEE Transactions on Robotics</source><italic>,</italic> <volume>31</volume><issue>(3)</issue><italic>,</italic> <fpage>672</fpage>&#x2013;<lpage>681</lpage>.</mixed-citation></ref>
<ref id="ref-11"><label>11.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Hang</surname>, <given-names>T.</given-names></string-name>, <string-name><surname>Ueha</surname>, <given-names>R.</given-names></string-name>, <string-name><surname>Hirai</surname>, <given-names>H.</given-names></string-name>, <string-name><surname>Miyazaki</surname>, <given-names>F.</given-names></string-name></person-group> (<year>2010</year>). <article-title>A study on dynamical role division in a crank-rotation task from the viewpoint of kinetics and muscle activity analysis</article-title>. <conf-name>IEEE/RSJ International Conference on Intelligent Robots &#x0026; Systems</conf-name>, pp. <fpage>2188</fpage>&#x2013;<lpage>2193</lpage>. <conf-loc>Taibei</conf-loc>.</mixed-citation></ref>
<ref id="ref-12"><label>12.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Jaberzadehansari</surname>, <given-names>R.</given-names></string-name>, <string-name><surname>Karayiannidis</surname>, <given-names>Y.</given-names></string-name></person-group> (<year>2021</year>). <article-title>Task-based role adaptation for human-robot cooperative object handling</article-title>. <source>IEEE Robotics and Automation Letters</source><italic>,</italic> <volume>2</volume><issue>(6)</issue><italic>,</italic> <fpage>3592</fpage>&#x2013;<lpage>3598</lpage>.</mixed-citation></ref>
<ref id="ref-13"><label>13.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Passenberg</surname>, <given-names>C.</given-names></string-name>, <string-name><surname>Groten</surname>, <given-names>R.</given-names></string-name>, <string-name><surname>Peer</surname>, <given-names>A.</given-names></string-name>, <string-name><surname>Buss</surname>, <given-names>M.</given-names></string-name></person-group> (<year>2011</year>). <article-title>Towards real time haptic assistance adaptation optimizing task performance and human effort</article-title>. <conf-name>IEEE World Haptics Conference</conf-name>, pp. <fpage>155</fpage>&#x2013;<lpage>160</lpage>. <conf-loc>Istanbul, Turkey</conf-loc>.</mixed-citation></ref>
<ref id="ref-14"><label>14.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Gu</surname>, <given-names>Y.</given-names></string-name>, <string-name><surname>Thobbi</surname>, <given-names>A.</given-names></string-name>, <string-name><surname>Sheng</surname>, <given-names>W.</given-names></string-name></person-group> (<year>2011</year>). <article-title>Human robot collaborative manipulation through imitation and reinforcement learning</article-title>. <conf-name>IEEE/RSJ International Conference on Intelligent Robots &#x0026; Systems</conf-name>, pp. <fpage>151</fpage>&#x2013;<lpage>156</lpage>. <conf-loc>Shenzhen, China</conf-loc>.</mixed-citation></ref>
<ref id="ref-15"><label>15.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Evrard</surname>, <given-names>P.</given-names></string-name>, <string-name><surname>Kheddar</surname>, <given-names>A.</given-names></string-name></person-group> (<year>2009</year>). <article-title>Homotopy switching model for dyad haptic interaction in physical collaborative tasks</article-title>. <conf-name>World Haptics Third Joint Eurohaptics Conference &#x0026; Symposium on Haptic Interfaces for Virtual Environment &#x0026; Tele-Operator Systems</conf-name>, pp. <fpage>45</fpage>&#x2013;<lpage>50</lpage>. <conf-loc>Salt Lake City, USA</conf-loc>.</mixed-citation></ref>
<ref id="ref-16"><label>16.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Evrard</surname>, <given-names>P.</given-names></string-name>, <string-name><surname>Kheddar</surname>, <given-names>A.</given-names></string-name></person-group> (<year>2009</year>). <article-title>Homotopy-based controller for physical human-robot interaction</article-title>. <conf-name>ROMAN 2009-The 18th IEEE International Symposium on Robot and Human Interactive Communication</conf-name>, pp. <fpage>1</fpage>&#x2013;<lpage>6</lpage>. <publisher-loc>Toyama, Japan</publisher-loc>.</mixed-citation></ref>
<ref id="ref-17"><label>17.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Takagi</surname>, <given-names>A.</given-names></string-name>, <string-name><surname>Li</surname>, <given-names>Y.</given-names></string-name>, <string-name><surname>Burdet</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2021</year>). <article-title>Flexible assimilation of human&#x2019;s target for versatile human-robot physical interaction</article-title>. <source>IEEE Transactions on Haptics</source><italic>,</italic> <volume>14</volume><issue>(2)</issue><italic>,</italic> <fpage>421</fpage>&#x2013;<lpage>431</lpage>; <pub-id pub-id-type="pmid">33226954</pub-id></mixed-citation></ref>
<ref id="ref-18"><label>18.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wang</surname>, <given-names>C.</given-names></string-name>, <string-name><surname>Zhao</surname>, <given-names>J.</given-names></string-name></person-group> (<year>2023</year>). <article-title>Role dynamic assignment of human-robot collaboration based on target prediction and fuzzy inference</article-title>. <source>IEEE Transactions on Industrial Informatics</source><italic>,</italic> <fpage>1</fpage>&#x2013;<lpage>11</lpage>. <pub-id pub-id-type="doi">10.1109/TII.2023.3266378</pub-id></mixed-citation></ref>
<ref id="ref-19"><label>19.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Modares</surname>, <given-names>H.</given-names></string-name>, <string-name><surname>Ranatunga</surname>, <given-names>I.</given-names></string-name>, <string-name><surname>Lewis</surname>, <given-names>F. L.</given-names></string-name></person-group> (<year>2016</year>). <article-title>Optimized assistive human robot interaction using reinforcement learning</article-title>. <source>IEEE Transactions on Cybernetics</source><italic>,</italic> <volume>46</volume><issue>(3)</issue><italic>,</italic> <fpage>655</fpage>&#x2013;<lpage>667</lpage>; <pub-id pub-id-type="pmid">25823055</pub-id></mixed-citation></ref>
<ref id="ref-20"><label>20.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Tutsoy</surname>, <given-names>O.</given-names></string-name>, <string-name><surname>Barkana</surname>, <given-names>D. E.</given-names></string-name></person-group> (<year>2021</year>). <article-title>Model free adaptive control of the under-actuated robot manipulator with the chaotic dynamics</article-title>. <source>ISA Transactions</source><italic>,</italic> <volume>118</volume><italic>,</italic> <fpage>106</fpage>&#x2013;<lpage>115</lpage>; <pub-id pub-id-type="pmid">33610316</pub-id></mixed-citation></ref>
<ref id="ref-21"><label>21.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Nakamura</surname>, <given-names>Y.</given-names></string-name>, <string-name><surname>Mori</surname>, <given-names>T.</given-names></string-name>, <string-name><surname>Sato</surname>, <given-names>M. A.</given-names></string-name>, <string-name><surname>Ishii</surname>, <given-names>S.</given-names></string-name></person-group> (<year>2007</year>). <article-title>Reinforcement learning for a biped robot based on a CPG-actor-critic method</article-title>. <source>Neural Networks</source><italic>,</italic> <volume>20</volume><issue>(6)</issue><italic>,</italic> <fpage>723</fpage>&#x2013;<lpage>735</lpage>; <pub-id pub-id-type="pmid">17412559</pub-id></mixed-citation></ref>
<ref id="ref-22"><label>22.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Dimeas</surname>, <given-names>F.</given-names></string-name>, <string-name><surname>Aspragathos</surname>, <given-names>N.</given-names></string-name></person-group> (<year>2015</year>). <article-title>Reinforcement learning of variable admittance control for human-robot co-manipulation</article-title>. <conf-name>IEEE/RSJ International Conference on Intelligent Robots &#x0026; Systems</conf-name>, pp. <fpage>1011</fpage>&#x2013;<lpage>1016</lpage>. <conf-loc>Hamburg, Germany</conf-loc>.</mixed-citation></ref>
<ref id="ref-23"><label>23.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Seraji</surname>, <given-names>H.</given-names></string-name></person-group> (<year>1994</year>). <article-title>Adaptive admittance control: An approach to explicit force control in compliant motion</article-title>. <conf-name>IEEE International Conference on Robotics &#x0026; Automation</conf-name>, pp. <fpage>2705</fpage>&#x2013;<lpage>2712</lpage>. <conf-loc>San Diego, CA, USA</conf-loc>.</mixed-citation></ref>
<ref id="ref-24"><label>24.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Watkins</surname>, <given-names>C. J. C. H.</given-names></string-name>, <string-name><surname>Dayan</surname>, <given-names>P.</given-names></string-name></person-group> (<year>1992</year>). <article-title>Q-learning</article-title>. <source>Machine Learning</source><italic>,</italic> <volume>8</volume><issue>(3&#x2013;4)</issue><italic>,</italic> <fpage>279</fpage>&#x2013;<lpage>292</lpage>.</mixed-citation></ref>
</ref-list>
</back></article>