index.html

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- Metadata used by search engines and social-media link previews; these tags are the page's "business card". -->
  <meta name="description" content="A skinning-centric approach that uses linear blend skinning (LBS) to drive realistic, speech-synchronized facial expressions on an animatronic robot face in real time.">
  <meta property="og:title" content="Driving Animatronic Robot Facial Expression From Speech">
  <meta property="og:description" content="A skinning-centric approach that uses linear blend skinning (LBS) to drive realistic, speech-synchronized facial expressions on an animatronic robot face in real time.">
  <!-- TODO: replace with the absolute URL this page is published at; the value below is still the template placeholder. -->
  <meta property="og:url" content="URL OF THE WEBSITE">
  <!-- Path to the banner image. Optimal dimensions are 1200x630. -->
  <!-- NOTE(review): this points at the favicon (.ico); confirm it is the intended banner and that it matches the declared size. -->
  <meta property="og:image" content="static/images/Tong_Robotics_Logo.ico">
  <meta property="og:image:width" content="1200">
  <meta property="og:image:height" content="630">


  <meta name="twitter:title" content="Driving Animatronic Robot Facial Expression From Speech">
  <meta name="twitter:description" content="A skinning-centric approach that uses linear blend skinning (LBS) to drive realistic, speech-synchronized facial expressions on an animatronic robot face in real time.">
  <!-- Path to the banner image. Optimal dimensions are 1200x600. -->
  <meta name="twitter:image" content="static/images/Tong_Robotics_Logo.ico">
  <meta name="twitter:card" content="summary_large_image">
  <!-- Keywords for the paper to be indexed by. -->
  <meta name="keywords" content="Animatronic Face, Robot Facial Expression, Speech-driven Facial Motion Synthesis">
  <meta name="viewport" content="width=device-width, initial-scale=1">


  <!-- Browser-tab title and favicon. -->
  <title>Speech-driven Animatronic Robot Face</title>
  <link rel="icon" type="image/x-icon" href="static/images/Tong_Robotics_Logo.ico">

  <!-- Web fonts served by Google Fonts. -->
  <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro">

  <!-- Stylesheets, in cascade order: Bulma and its carousel/slider add-ons, -->
  <!-- the Font Awesome and Academicons icon sets, then the page's own rules last. -->
  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">

  <!-- Scripts without `defer`/`async` are fetched and executed in document order, so the order below matters. -->
  <!-- jQuery 3.5.1, loaded from the Google-hosted libraries CDN. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <!-- Adobe View SDK loader. NOTE(review): no PDF viewer is embedded in this page's markup; confirm it is still needed. -->
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <!-- Font Awesome script; `defer` postpones its execution until the document has been parsed. -->
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <!-- Bulma carousel and slider extension scripts. -->
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <!-- Page-specific script; presumably initialises the carousel/slider widgets (verify against static/js/index.js). -->
  <script src="static/js/index.js"></script>
</head>
<body>


  <!-- Page header: paper title, authors, affiliation and resource links. -->
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-1 publication-title">Driving Animatronic Robot Facial Expression From Speech</h1>

            <!-- Paper authors. Names are plain text because no personal-page URLs have been supplied; -->
            <!-- wrap a name in a link once its author's homepage URL is known. -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">Boren Li<sup>*&#8224;</sup>,</span>
              <span class="author-block">Hang Li<sup>*</sup>,</span>
              <span class="author-block">Hangxin Liu<sup>&#8224;</sup></span>
            </div>

            <!-- Affiliation and the legend for the author markers used above. -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">Beijing Institute for General Artificial Intelligence (BIGAI)<small><br>State Key Laboratory of General Artificial Intelligence<br></small></span>
              <span class="eql-cntrb"><small><br><sup>*</sup>Equal Contribution, <sup>&#8224;</sup>Corresponding Author</small></span>
            </div>

            <div class="column has-text-centered">
              <div class="publication-links">
                <!-- arXiv PDF link -->
                <span class="link-block">
                  <a class="external-link button is-normal is-rounded is-dark" href="https://arxiv.org/pdf/2403.12670.pdf" target="_blank" rel="noopener">
                    <span class="icon">
                      <i class="fas fa-file-pdf" aria-hidden="true"></i>
                    </span>
                    <span>Paper</span>
                  </a>
                </span>

                <!-- Supplementary PDF link -->
                <!-- <span class="link-block">
                  <a href="static/pdfs/supplementary_material.pdf" target="_blank"
                  class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>Supplementary</span>
                </a>
              </span> -->

                <!-- GitHub link -->
                <span class="link-block">
                  <a class="external-link button is-normal is-rounded is-dark" href="https://github.com/library87/OpenRoboExp" target="_blank" rel="noopener">
                    <span class="icon">
                      <i class="fab fa-github" aria-hidden="true"></i>
                    </span>
                    <span>Code</span>
                  </a>
                </span>

                <!-- arXiv abstract link -->
                <span class="link-block">
                  <a class="external-link button is-normal is-rounded is-dark" href="https://arxiv.org/abs/2403.12670" target="_blank" rel="noopener">
                    <span class="icon">
                      <i class="ai ai-arxiv" aria-hidden="true"></i>
                    </span>
                    <span>arXiv</span>
                  </a>
                </span>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>


<!-- Teaser video -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <!-- Muted so browsers permit autoplay; `playsinline` keeps playback inside the page on iOS. -->
      <!-- No `poster` or `height` attribute: an empty poster URL and a percentage height are not valid values. -->
      <video id="tree" autoplay controls muted loop playsinline>
        <source src="static/videos/iros24.mp4" type="video/mp4">
      </video>
      <!-- A caption for the video can be added here as a paragraph with class "subtitle has-text-centered". -->
    </div>
  </div>
</section>
<!-- End teaser video -->

<!-- Paper abstract -->
<section class="section hero is-light">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <!-- The repository URL at the end of the abstract is a real link so readers can follow it directly. -->
          <p>
            Animatronic robots hold the promise of enabling natural human-robot interaction through lifelike facial expressions. However, generating realistic, speech-synchronized robot expressions poses significant challenges due to the complexities of facial biomechanics and the need for responsive motion synthesis. This paper introduces a novel, skinning-centric approach to drive animatronic robot facial expressions from speech input. At its core, the proposed approach employs linear blend skinning (LBS) as a unifying representation, guiding innovations in both embodiment design and motion synthesis. LBS informs the actuation topology, facilitates human expression retargeting, and enables efficient speech-driven facial motion generation. This approach demonstrates the capability to produce highly realistic facial expressions on an animatronic face in real-time at over 4000 fps on a single Nvidia RTX 4090, significantly advancing robots' ability to replicate nuanced human expressions for natural interaction. To foster further research and development in this field, the code has been made publicly available at: <a href="https://github.com/library87/OpenRoboExp" target="_blank" rel="noopener">https://github.com/library87/OpenRoboExp</a>.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End paper abstract -->


<!-- Image carousel: one figure from the paper per item, each followed by its caption. -->
<!-- Every image carries alt text summarising what the figure shows, for readers who cannot see it. -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <div id="results-carousel" class="carousel results-carousel">
        <!-- Figure: speech-driven expressions overview -->
        <div class="item">
          <img src="static/images/intro.png" alt="Audio waveform above a sequence of photos showing the animatronic robot face's expressions at different time points">
          <h2 class="subtitle has-text-justified">
            <strong>Dynamic animatronic robot facial expressions generated from speech.</strong> The figure shows the system's capability to produce diverse and lifelike facial expressions in real-time, synchronized with the corresponding audio speech input. The waveform at the top represents the audio input, while the series of images below showcase the robot's facial responses at different time points.
          </h2>
        </div>
        <!-- Figure: approach overview -->
        <div class="item">
          <img src="static/images/approach_overview.png" alt="Diagram of the proposed approach with three components: skinning-oriented robot development, skinning motion imitation learning, and speech-driven robot orchestration">
          <h2 class="subtitle has-text-justified">
            <strong>The proposed approach for creating a speech-driven animatronic robot face using LBS.</strong> The approach comprises three major components: (1) <em>skinning-oriented robot development</em> designs and constructs the animatronic face paired with a kinematics simulator based on the target skinning appearance, (2) <em>skinning motion imitation learning</em> involves training an LBS-based model from 3D human demonstrations to generate facial expressions from speech input, and (3) <em>speech-driven robot orchestration</em> generates animatronic facial expressions during inference by utilizing the developed platform, simulator, and learned model. The diagram highlights key development steps, outputs, and inference processes, demonstrating the end-to-end workflow from concept to final animatable robot face.
          </h2>
        </div>
        <!-- Figure: skinning-oriented robot design -->
        <div class="item">
          <img src="static/images/skinning_oriented_robot_development.png" alt="Diagram of the skinning-oriented robot design: LBS-oriented kinematics design alongside the electro-mechanical design and development">
          <h2 class="subtitle has-text-justified">
            <strong>The proposed skinning-oriented robot design.</strong> The figure comprises two primary components: (1) <em>LBS-oriented kinematics design</em>, which showcases the facial mesh model with strategically placed control points for various facial features to achieve actuation topology for the facial muscular system that matches the designed LBS motion space and references facial anatomy, and (2) <em>electro-mechanical design and development</em> accounting for physical constraints of the embodiment, including key mechanical components of the skin, skeleton and muscular system, as well as the electrical control system. This comprehensive view demonstrates how the theoretical LBS model is translated into a functional, physically embodied animatronic face.
          </h2>
        </div>
        <!-- Figure: imitation learning method -->
        <div class="item">
          <img src="static/images/imitation_learning.png" alt="Diagram of the speech-driven imitation learning method: model architecture with speech, speaking style and LBS encoders, plus the training and inference branches">
          <h2 class="subtitle has-text-justified">
            <strong>The proposed speech-driven facial skinning motion imitation learning method.</strong> The model architecture (blue section) comprises three key components: (1) a <em>frame-level speech encoder</em> that processes audio input and generates phoneme logits, (2) a <em>speaking style encoder</em> that captures individual speaking styles, and (3) an <em>LBS encoder</em> that generates blendshape coefficients.  During training (red section), the model learns to imitate human facial skinning motions by minimizing the difference between generated and target expressions. In the inference branch (orange section), the trained model generates blendshape coefficients for the robot LBS decoder, producing robot-specific facial skinning motions as reference signals for the downstream kinematics simulator.
          </h2>
        </div>
        <!-- Figure: motion space validation -->
        <div class="item">
          <img src="static/images/exp1_motion_space.png" alt="Motion space validation: per-region actuated blendshape error plots on the left and eight simulated-versus-actuated blendshape comparisons on the right">
          <h2 class="subtitle has-text-justified">
            <strong>Motion Space Validation.</strong> <strong>Actuated blendshape error for different facial regions (left figure):</strong> Color-coded skinning landmarks represent different facial regions for evaluation. The 3D face model shows color-coded landmarks for different facial areas. Error distributions between simulated and physically actuated blendshapes are visualized using violin plots, box plots, and scattered points. Each point represents a single blendshape, evaluated using region-specific landmarks. Median errors (in mm) are provided for each facial region, ranging from 1.76mm (nose) to 8.63mm (jaw). <strong>Qualitative comparison (right figure):</strong> Eight comparisons between simulated and actuated blendshapes are shown. Blendshapes (1)-(6) demonstrate high accuracy, while (7) <em>mouth close</em> and (8) <em>jaw open</em> highlight limitations in the current design, exhibiting maximum errors for their respective regions.
          </h2>
        </div>
        <!-- Figure: tracking performance validation -->
        <div class="item">
          <img src="static/images/exp2_tracking.png" alt="Tracking performance validation: per-region error distributions between simulated and physically actuated facial articulation sequences">
          <h2 class="subtitle has-text-justified">
            <strong>Tracking Performance Validation.</strong> MSE error distributions between simulated and physically actuated facial articulation sequences are presented using violin plots, box and whisker plots, and scattered points, with each point representing one frame. Evaluation landmarks are grouped by facial region. Ten realistic facial articulation sequences from different speakers with distinct speaking styles were evaluated. Mean median errors across the ten sequences for each facial region are 2.56mm (eye), 3.39mm (brow), 1.74mm (nose), 3.08mm (cheek), 3.86mm (mouth), and 5.03mm (jaw). This comprehensive visualization demonstrates the animatronic face's ability to maintain consistent tracking performance across diverse speaking styles while highlighting region-specific variations in accuracy.
          </h2>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End image carousel -->

<!-- BibTeX citation -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <!-- Whitespace inside <pre> is rendered verbatim, so the entry below keeps its own indentation. -->
    <pre><code>@inproceedings{li2024driving,
         title={Driving Animatronic Robot Facial Expression From Speech},
         author={Li, Boren and Li, Hang and Liu, Hangxin},
         booktitle={IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
         year={2024},
         url={https://arxiv.org/abs/2403.12670}}</code></pre>
  </div>
</section>
<!-- End BibTeX citation -->
  <!-- Page footer: template attribution and content licence. -->
  <footer class="footer">
    <div class="container">
      <div class="columns is-centered">
        <div class="column is-8">
          <div class="content">

            <p>
              This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank" rel="noopener">Academic Project Page Template</a> which was adopted from the <a href="https://nerfies.github.io" target="_blank" rel="noopener">Nerfies</a> project page.
              You are free to borrow the source code of this website, we just ask that you link back to this page in the footer. <br> This website is licensed under a <a rel="license noopener" href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
              Commons Attribution-ShareAlike 4.0 International License</a>.
            </p>

          </div>
        </div>
      </div>
    </div>
  </footer>

<!-- Statcounter tracking code -->
  
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->

    <!-- End of Statcounter Code -->

  </body>
  </html>