<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMC</journal-id>
<journal-id journal-id-type="nlm-ta">CMC</journal-id>
<journal-id journal-id-type="publisher-id">CMC</journal-id>
<journal-title-group>
<journal-title>Computers, Materials &#x0026; Continua</journal-title>
</journal-title-group>
<issn pub-type="epub">1546-2226</issn>
<issn pub-type="ppub">1546-2218</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">72544</article-id>
<article-id pub-id-type="doi">10.32604/cmc.2025.072544</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Real-Time 3D Scene Perception in Dynamic Urban Environments via Street Detection Gaussians</article-title>
<alt-title alt-title-type="left-running-head">Real-Time 3D Scene Perception in Dynamic Urban Environments via Street Detection Gaussians</alt-title>
<alt-title alt-title-type="right-running-head">Real-Time 3D Scene Perception in Dynamic Urban Environments via Street Detection Gaussians</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<name name-style="western"><surname>Du</surname><given-names>Yu</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-2" contrib-type="author">
<name name-style="western"><surname>Guan</surname><given-names>Runwei</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-3" contrib-type="author">
<name name-style="western"><surname>Lam</surname><given-names>Ho-Pun</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-4" contrib-type="author">
<name name-style="western"><surname>Smith</surname><given-names>Jeremy</given-names></name><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Yue</surname><given-names>Yutao</given-names></name><xref ref-type="aff" rid="aff-4">4</xref><xref ref-type="aff" rid="aff-5">5</xref></contrib>
<contrib id="author-6" contrib-type="author">
<name name-style="western"><surname>Man</surname><given-names>Ka Lok</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-7" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Li</surname><given-names>Yan</given-names></name><xref ref-type="aff" rid="aff-6">6</xref><email>leeyeon@inha.ac.kr</email></contrib>
<aff id="aff-1"><label>1</label><institution>School of Advanced Technology, Xi&#x2019;an Jiaotong-Liverpool University</institution>, <addr-line>Suzhou, 215123</addr-line>, <country>China</country></aff>
<aff id="aff-2"><label>2</label><institution>Thrust of Artificial Intelligence, The Hong Kong University of Science and Technology (Guangzhou)</institution>, <addr-line>Guangzhou, 511400</addr-line>, <country>China</country></aff>
<aff id="aff-3"><label>3</label><institution>Department of Electrical Engineering and Electronics, University of Liverpool</institution>, <addr-line>Liverpool, L69 7ZX</addr-line>, <country>UK</country></aff>
<aff id="aff-4"><label>4</label><institution>The Hong Kong University of Science and Technology (Guangzhou)</institution>, <addr-line>Guangzhou, 511400</addr-line>, <country>China</country></aff>
<aff id="aff-5"><label>5</label><institution>Institute of Deep Perception Technology, JITRI</institution>, <addr-line>Wuxi, 214000</addr-line>, <country>China</country></aff>
<aff id="aff-6"><label>6</label><institution>Department of Electrical and Computer Engineering, Inha University</institution>, <addr-line>Incheon, 402751</addr-line>, <country>Republic of Korea</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Author: Yan Li. Email: <email>leeyeon@inha.ac.kr</email></corresp>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2026</year>
</pub-date>
<pub-date date-type="pub" publication-format="electronic">
<day>10</day><month>2</month><year>2026</year>
</pub-date>
<volume>87</volume>
<issue>1</issue>
<elocation-id>57</elocation-id>
<history>
<date date-type="received">
<day>29</day>
<month>08</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>02</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2026 The Authors.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Published by Tech Science Press.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMC_72544.pdf"></self-uri>
<abstract>
<p>As a cornerstone for applications such as autonomous driving, 3D urban perception is a burgeoning field of study. Enhancing the performance and robustness of these perception systems is crucial for ensuring the safety of next-generation autonomous vehicles. In this work, we introduce a novel neural scene representation called Street Detection Gaussians (SDGs), which redefines urban 3D perception through an integrated architecture unifying reconstruction and detection. At its core lies the dynamic Gaussian representation, where time-conditioned parameterization enables simultaneous modeling of static environments and dynamic objects through physically constrained Gaussian evolution. The framework&#x2019;s radar-enhanced perception module learns cross-modal correlations between sparse radar data and dense visual features, resulting in a 22% reduction in occlusion errors compared to vision-only systems. A breakthrough differentiable rendering pipeline back-propagates semantic detection losses throughout the entire 3D reconstruction process, enabling the optimization of both geometric and semantic fidelity. Evaluated on the Waymo Open Dataset and the KITTI Dataset, the system achieves real-time performance (135 Frames Per Second (FPS)), photorealistic quality (Peak Signal-to-Noise Ratio (PSNR) 34.9 dB), and state-of-the-art detection accuracy (78.1% Mean Average Precision (mAP)), demonstrating a <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:mn>3.8</mml:mn><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> end-to-end improvement over existing hybrid approaches while enabling seamless integration with autonomous driving stacks.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Radar-vision fusion</kwd>
<kwd>differentiable rendering</kwd>
<kwd>autonomous driving perception</kwd>
<kwd>3D reconstruction</kwd>
<kwd>occlusion robustness</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>Inha University</funding-source>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>High-fidelity 3D modeling is increasingly being applied to urban scenarios, such as traffic monitoring. While Gaussian Splatting (GS) and Neural Radiance Field (NeRF) based models [<xref ref-type="bibr" rid="ref-1">1</xref>] achieve impressive reconstruction and rendering quality, they do not provide real-time traffic detection and recognition capabilities. Additionally, most existing research primarily focuses on static scenes. Although extensions such as Block-NeRF [<xref ref-type="bibr" rid="ref-2">2</xref>] and GS-based networks [<xref ref-type="bibr" rid="ref-3">3</xref>] aim to address large-scale streets by dividing them into subscenes, they still struggle to achieve real-time monitoring of dynamic objects.</p>
<p>Urban scene reconstruction faces three main challenges: <bold>(1) Speed&#x2013;accuracy tradeoff:</bold> Neural Radiance Field (NeRF, [<xref ref-type="bibr" rid="ref-1">1</xref>]) requires hours per scene, while real-time methods (e.g., 3D Gaussian Splatting (3DGS) [<xref ref-type="bibr" rid="ref-4">4</xref>]) lack semantic detection; <bold>(2) Occlusion handling:</bold> Moving objects frequently block critical traffic elements; <bold>(3) Scalability:</bold> Large-scale scenes (&#x003E;1 km<sup>2</sup>) require efficient memory usage.</p>
<p>Dynamic urban perception requires not only real-time reconstruction and rendering but also reliable object detection and robustness under occlusion. However, street-scale Gaussian methods such as Street Gaussians and 4D Gaussian Splatting largely optimize for view synthesis, lacking detection-aware training and cross-sensor fusion. To close this gap, we propose <italic>Street Detection Gaussian</italic> (SDG), which integrates detection supervision into the Gaussian pipeline, fuses millimeter-wave (mmWave) radar for depth reliability, and models movers with time-conditioned Gaussians while keeping static backgrounds in 3DGS. SDG further leverages large multi-modal models [<xref ref-type="bibr" rid="ref-5">5</xref>] for frame-level semantics, yielding a perception-oriented, real-time solution for dynamic urban scenes.</p>
<p>Compared with existing Gaussian- and NeRF-based approaches, <bold>Street Detection Gaussians (SDG)</bold> introduces several fundamental differences. Unlike traditional 3DGS and Street Gaussians that rely solely on image features for static reconstruction, SDG incorporates object-level semantics from Grounded-SAM to guide Gaussian placement and density. In contrast to dynamic NeRF variants such as D-NeRF and 4DGS, SDG performs confidence-aware radar-camera fusion to enhance geometric accuracy and temporal stability. Furthermore, SDG employs detection-aware pruning and tile-based rendering to sustain real-time performance on city-scale scenes. Our key contributions are:
<list list-type="order">
<list-item>
<p><bold>Hybrid static-dynamic Gaussian representation:</bold> Models static backgrounds with 3DGS and dynamic objects with time-dependent parameters, achieving 135 FPS while improving mAP@0.5 by 15.8%, addressing the speed-semantic fidelity trade-off.</p></list-item>
<list-item>
<p><bold>Radar-guided depth refinement:</bold> Fuses sparse radar with monocular depth (MiDaS [<xref ref-type="bibr" rid="ref-6">6</xref>]), enhancing depth estimation and reducing occlusion errors by 22%.</p></list-item>
<list-item>
<p><bold>Detection-aware splatting optimization:</bold> Jointly optimizes Gaussian parameters and detection to prune redundant Gaussians, reducing memory use while maintaining quality for large-scale (&#x003E;1 <inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:msup><mml:mtext>km</mml:mtext><mml:mn>2</mml:mn></mml:msup></mml:math></inline-formula>) scenes.</p></list-item>
</list></p>
<p>These advances enable photorealistic, interactive urban traffic scene synthesis with significantly reduced complexity&#x2014;from <inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:mrow><mml:mi>&#x1D4AA;</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msup><mml:mi>N</mml:mi><mml:mn>3</mml:mn></mml:msup><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> in NeRF to <inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:mrow><mml:mi>&#x1D4AA;</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>N</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>&#x2014;while maintaining high fidelity. On the Waymo dataset [<xref ref-type="bibr" rid="ref-7">7</xref>], the KITTI and KITTI-360 datasets [<xref ref-type="bibr" rid="ref-8">8</xref>], and other benchmarks, SDG outperforms state-of-the-art methods in rendering speed, adaptability, and detection accuracy, demonstrating its potential for large-scale autonomous driving and urban perception applications.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related Work</title>
<sec id="s2_1">
<label>2.1</label>
<title>Semantic Perception for Street-Scale Scene Understanding</title>
<p>In recent years, there has been continuous innovation in modeling and rendering dynamic urban environments, showing great potential in domains such as computer vision and computer graphics, particularly for traffic applications. This section reviews key developments in neural scene reconstruction, point cloud-based modeling, scalable hybrid approaches, and radar-vision fusion, highlighting their contributions and limitations in large-scale, real-world applications.</p>
<p>Neural scene representation techniques have revolutionized 3D modeling by leveraging implicit volumetric representations. Neural Radiance Field (NeRF, [<xref ref-type="bibr" rid="ref-1">1</xref>]) introduced a framework for synthesizing photorealistic views of static scenes, and extensions such as Block-NeRF and NeRF&#x002B;&#x002B; [<xref ref-type="bibr" rid="ref-9">9</xref>] improved scalability by partitioning scenes or modeling unbounded depth. However, these methods struggle with temporal dynamics and are computationally prohibitive for real-time use. Dynamic extensions like D-NeRF [<xref ref-type="bibr" rid="ref-10">10</xref>] and Neural Scene Flow Fields (NSFF, [<xref ref-type="bibr" rid="ref-11">11</xref>]) incorporate motion under steady background assumptions, while Multi-Camera Neural Radiance Fields (MC-NeRF, [<xref ref-type="bibr" rid="ref-12">12</xref>]) adapts NeRF for multi-camera outdoor setups to address pose inaccuracies and color inconsistencies. Although effective, they still require long training times and incur high computational costs, limiting applicability to large-scale dynamic environments.</p>
<p>Point clouds provide efficient and interpretable 3D representations. Methods such as PointNet [<xref ref-type="bibr" rid="ref-13">13</xref>] and PointNet&#x002B;&#x002B; [<xref ref-type="bibr" rid="ref-14">14</xref>] pioneered learning-based segmentation and classification from point-based input, but remain primarily static and lack temporal modeling. More recent approaches like 3DGS employ Gaussian representations for photorealistic rendering with high computational efficiency, while dynamic variants [<xref ref-type="bibr" rid="ref-15">15</xref>] extend this concept to motion modeling with local rigidity constraints. Although promising for long-term tracking and dense reconstruction, such approaches remain underexplored in large-scale urban contexts.</p>
<p>Scaling scene representations to large, dynamic environments poses additional challenges. Hybrid solutions such as K-Planes [<xref ref-type="bibr" rid="ref-16">16</xref>] factorize geometry into learnable spatial and temporal planes for improved interpretability and memory efficiency, and StreetSurf [<xref ref-type="bibr" rid="ref-17">17</xref>] introduces multi-shell neural fields for near- and far-view modeling at the urban scale. While techniques like hash grids and cuboid warping enhance rendering quality and speed, integration with dynamic object tracking and sparse sensor data, such as radar, remains largely unresolved.</p>
<p>Radar sensing is increasingly leveraged for robust perception under occlusion and adverse weather. Traditional methods rely on handcrafted features, while newer systems fuse radar and vision for improved object tracking and scene understanding [<xref ref-type="bibr" rid="ref-18">18</xref>]. Despite progress, most focus on specific perception tasks rather than full-scene reconstruction. Attempts to integrate sparse radar data into neural representations for automatic annotation and dynamic reconstruction show potential, but real-time performance at the city scale has yet to be achieved. RCMixer [<xref ref-type="bibr" rid="ref-19">19</xref>] introduces a vision-guided end-to-end radar-camera fusion network, enhancing multi-modal feature alignment for object detection. A dual-view framework combining Perspective View and Bird&#x2019;s Eye View representations [<xref ref-type="bibr" rid="ref-20">20</xref>] enables complementary fusion across spatial domains, improving detection in adverse conditions. Similarly, Enhanced Radar Perception (ERP) [<xref ref-type="bibr" rid="ref-21">21</xref>] leverages multi-task learning to infer radar point height and refine fusion features, while the 2024 survey by Wei et al. [<xref ref-type="bibr" rid="ref-22">22</xref>] summarizes deep-learning-based radar-vision fusion strategies, highlighting that most existing works remain detection-focused and lack full-scene reconstruction. In contrast, our SDG framework integrates radar priors directly into 3D Gaussian scene representations, bridging real-time reconstruction, semantic segmentation, and multi-sensor consistency within a unified architecture.</p>
<p>To further clarify the limitations of existing methods and highlight the novelty of our work, <xref ref-type="table" rid="table-1">Table 1</xref> systematically compares representative approaches with our SDG across key performance metrics.</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>Comparison of key indicators between existing scene modeling methods and our SDG</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Method</th>
<th>Real-time performance (FPS)</th>
<th>Dynamic object handling</th>
<th>Occlusion robustness</th>
<th>Multi-modal fusion (Radar/Vision)</th>
</tr>
</thead>
<tbody>
<tr>
<td>NeRF [<xref ref-type="bibr" rid="ref-1">1</xref>]</td>
<td>0.002</td>
<td><inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula></td>
<td>Weak (static scene assumption)</td>
<td>Vision-only</td>
</tr>
<tr>
<td>3DGS [<xref ref-type="bibr" rid="ref-4">4</xref>]</td>
<td>63</td>
<td><inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> (no semantic detection)</td>
<td>Weak</td>
<td>Vision-only</td>
</tr>
<tr>
<td>Block-NeRF [<xref ref-type="bibr" rid="ref-2">2</xref>]</td>
<td>0.005</td>
<td><inline-formula id="ieqn-7"><mml:math id="mml-ieqn-7"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula></td>
<td>Weak (occlusion unsolved)</td>
<td>Vision-only</td>
</tr>
<tr>
<td>MARS [<xref ref-type="bibr" rid="ref-23">23</xref>]</td>
<td>0.030</td>
<td>&#x2713; (basic tracking)</td>
<td>Moderate (no radar)</td>
<td>Vision-only</td>
</tr>
<tr>
<td>RCMixer [<xref ref-type="bibr" rid="ref-19">19</xref>]</td>
<td>15</td>
<td>&#x2713; (detection only)</td>
<td>Moderate</td>
<td>Radar-vision</td>
</tr>
<tr>
<td>SDG (Ours)</td>
<td>135</td>
<td>&#x2713;</td>
<td>Strong</td>
<td>Radar-vision</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-1fn1" fn-type="other">
<p>Note: FPS values are tested on the Waymo dataset (1066 <inline-formula id="ieqn-8"><mml:math id="mml-ieqn-8"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 1600 resolution). Occlusion robustness is measured by &#x201C;detection error rate in occluded regions&#x201D;. Multi-modal fusion is determined by &#x201C;whether non-visual sensors (e.g., radar) are integrated&#x201D;.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>As illustrated in <xref ref-type="table" rid="table-1">Table 1</xref>, three core gaps exist in current dynamic urban scene modeling methods: Trade-off between real-time performance and dynamic handling: 3DGS achieves real-time rendering at 63 FPS but lacks dynamic object detection capabilities; Modular and Realistic Simulator (MARS) supports basic dynamic tracking, yet its reliance on complex volumetric modeling limits the frame rate to only 0.030 FPS&#x2014;far below the real-time requirements of autonomous driving. Insufficient occlusion robustness: Static methods like NeRF and Block-NeRF cannot handle dynamic occlusions due to their static scene assumptions; vision-only methods (3DGS, MARS) suffer from excessively large detection errors in occluded areas. Lack of multi-modal fusion: Most compared methods rely solely on visual data (images/LiDAR) and fail to leverage radar&#x2019;s depth stability in adverse conditions, leading to significant depth estimation errors under varying illumination or occlusion.</p>

<p>Traffic scene reconstruction remains challenging due to the complex interplay between static infrastructure and dynamic objects. Traditional multi-view geometry and structure-from-motion methods struggle with temporal inconsistency, while neural approaches such as MARS [<xref ref-type="bibr" rid="ref-23">23</xref>] and StreetSurf incorporate motion disentanglement but still face trade-offs between accuracy and efficiency.</p>
<p>Building upon these advances, our work extends 3DGS toward real-time, radar-guided modeling of dynamic urban environments. By integrating tracked poses, sparse radar depth, and detection-aware optimization, SDGs achieve efficient, high-fidelity reconstruction and semantic perception simultaneously. In contrast to Street Gaussians, which focus on static street rendering, and 4D Gaussian Splatting, which models temporal dynamics for view synthesis, our framework uniquely unifies time-conditioned Gaussians, radar-guided refinement, and detection supervision. This design transforms Gaussian splatting from a rendering-oriented paradigm into a perception-centered framework for real-time urban scene understanding.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Geometric&#x2013;Semantic Inference for 3D Scene Understanding</title>
<p><bold>Scenario-oriented comparison.</bold> We now relate classical geometry, geo-semantic inference, and neural fields to four canonical outdoor layouts&#x2014;curved corridors, alleyways, winding pathways, and deck/platform scenes&#x2014;highlighting assumptions, strengths, weaknesses, and suitability, with citations to representative algorithms in each category. Structure-from-Motion (SfM)/Multi-View Stereo (MVS) and factor-graph Simultaneous Localization and Mapping (SLAM) reconstruct geometry from calibrated views, often regularized by piecewise-planar or layout priors such as Manhattan/Atlanta worlds [<xref ref-type="bibr" rid="ref-24">24</xref>]. Volumetric Truncated Signed Distance Function (TSDF)/voxel or surfel fusion improves closure and scale consistency for dense mapping [<xref ref-type="bibr" rid="ref-25">25</xref>]. These pipelines are interpretable and controllable, with clear error sources, but can be sensitive to scene assumptions (orthogonality/planarity), struggle with dynamics/occlusions, and may incur memory/time costs at the urban scale.</p>
<p>Semantic cues are coupled with geometry via Conditional Random Field (CRF)/Markov Random Field (MRF) models, Bayesian updates, or graph optimization to enforce layout/object consistency (ground&#x2013;wall&#x2013;opening; lane&#x2013;curb; facade&#x2013;aperture) across space and time [<xref ref-type="bibr" rid="ref-26">26</xref>]. This family is especially effective when functional structure is clear, or appearance is weak/variable, reducing drift and ambiguity. Limitations include dependence on annotation/generalization and the need for robust conflict resolution when semantics and geometry disagree.</p>
<p>NeRF and 3D Gaussian Splatting (3DGS) provide continuous or Gaussian field representations for photorealistic rendering; their dynamic and large-scale variants (incl. 4D formulations) improve temporal modeling and scalability [<xref ref-type="bibr" rid="ref-1">1</xref>,<xref ref-type="bibr" rid="ref-2">2</xref>,<xref ref-type="bibr" rid="ref-4">4</xref>]. However, many works remain view-synthesis-centric, with less integrated supervision for detection/segmentation, and may be brittle under heavy occlusion, adverse weather, or depth instability.</p>
<p>Our SDG-based framework complements the above by routing detection-aware losses through the reconstruction pathway and explicitly fusing mmWave radar with vision, which improves depth/occlusion robustness while unifying reconstruction, segmentation, and detection under real-time constraints. This design targets dynamic street scenes where purely visual neural reconstructions or purely geometric pipelines often degrade.</p>
<p><bold>Curved corridors:</bold> spline/centerline smoothness and relaxed Manhattan <inline-formula id="ieqn-9"><mml:math id="mml-ieqn-9"><mml:mo stretchy="false">&#x2192;</mml:mo></mml:math></inline-formula> Atlanta priors stabilize normals and boundaries in non-orthogonal segments [<xref ref-type="bibr" rid="ref-27">27</xref>,<xref ref-type="bibr" rid="ref-28">28</xref>]; semantic layout (wall/floor/doorway) reduces ambiguity in weak textures. <bold>Alleyways:</bold> strong perspective and facade&#x2013;ground decomposition help, with CRF/MRF encouraging facade continuity and curb/step priors; narrow spaces and moving pedestrians can violate static assumptions [<xref ref-type="bibr" rid="ref-29">29</xref>]. <bold>Winding pathways:</bold> clothoid/spline centerline priors maintain long-range consistency [<xref ref-type="bibr" rid="ref-30">30</xref>,<xref ref-type="bibr" rid="ref-31">31</xref>]; slope changes and frequent occlusion challenge vision-only depth, where radar/LiDAR constraints reduce drift. <bold>Deck/platform scenes:</bold> multi-plane and guardrail/boundary-line priors converge quickly for repetitive structures [<xref ref-type="bibr" rid="ref-32">32</xref>,<xref ref-type="bibr" rid="ref-33">33</xref>]; specular surfaces/repetitive textures may confuse matching, calling for semantics or multimodal cues.</p>
<p>These comparisons motivate the SDG design choices in <xref ref-type="sec" rid="s3">Section 3</xref>, where we combine time-conditioned Gaussians, radar-guided refinement, and detection-aware supervision to address dynamics, occlusions, and weak textures across the above scenarios.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Street Detection Gaussians Based Real-Time 3D Scene Representation</title>
<p>In this section, we present our framework that integrates 3DGS for static scene reconstruction, and object detection and segmentation using Grounded Segment Anything (Grounded-SAM, [<xref ref-type="bibr" rid="ref-34">34</xref>,<xref ref-type="bibr" rid="ref-35">35</xref>]). This combined approach reconstructs urban environments and detects dynamic objects in real time using only image-based inputs from the Waymo dataset.</p>
<p>To address the computational inefficiencies and limited real-time capabilities of the previous approaches, we introduce <italic>SDG</italic>, a novel network designed to efficiently reconstruct and render dynamic urban environments while detecting traffic participants with basic recognition capabilities. This design bridges the gap between existing static reconstruction methods, such as NeRF, and dynamic detection challenges, by combining 3DGS for efficient representation with Grounded-SAM for accurate dynamic segmentation, as depicted in <xref ref-type="fig" rid="fig-1">Fig. 1</xref>.</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>(Top) Overview of the proposed Gaussian reconstruction and detection framework. The structure integrates 3DGS for static scene reconstruction and uses Grounded-SAM for object detection and segmentation, enabling real-time modeling of dynamic urban environments. (Bottom) Summary of datasets used in this study</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72544-fig-1.tif"/>
</fig>
<p>This framework efficiently integrates scene reconstruction and object detection into a unified pipeline suitable for real-time applications in urban environments. The notation used throughout the paper is summarized in <xref ref-type="table" rid="table-2">Table 2</xref>. A schematic of the approach is provided in <xref ref-type="fig" rid="fig-2">Fig. 2</xref>, and the algorithm proceeds as follows.</p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>Notations and symbols for equations</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Notation</th>
<th>Meaning</th>
</tr>
</thead>
<tbody>
<tr>
<td><inline-formula id="ieqn-14"><mml:math id="mml-ieqn-14"><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula></td>
<td>3D position of the <inline-formula id="ieqn-15"><mml:math id="mml-ieqn-15"><mml:mi>i</mml:mi></mml:math></inline-formula>-th static Gaussian, <inline-formula id="ieqn-16"><mml:math id="mml-ieqn-16"><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula></td>
</tr>
<tr>
<td><inline-formula id="ieqn-17"><mml:math id="mml-ieqn-17"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula></td>
<td>Frobenius regularization loss (controlling Gaussian spread to avoid overfitting)</td>
</tr>
<tr>
<td><inline-formula id="ieqn-18"><mml:math id="mml-ieqn-18"><mml:msub><mml:mi>F</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula></td>
<td>Set of time-dependent 3D Gaussians for dynamic objects (e.g., vehicles, pedestrians) at time <inline-formula id="ieqn-19"><mml:math id="mml-ieqn-19"><mml:mi>t</mml:mi></mml:math></inline-formula></td>
</tr>
<tr>
<td><inline-formula id="ieqn-20"><mml:math id="mml-ieqn-20"><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula></td>
<td>3D position of the <inline-formula id="ieqn-21"><mml:math id="mml-ieqn-21"><mml:mi>j</mml:mi></mml:math></inline-formula>-th dynamic Gaussian at time <inline-formula id="ieqn-22"><mml:math id="mml-ieqn-22"><mml:mi>t</mml:mi></mml:math></inline-formula>, <inline-formula id="ieqn-23"><mml:math id="mml-ieqn-23"><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula></td>
</tr>
<tr>
<td><inline-formula id="ieqn-24"><mml:math id="mml-ieqn-24"><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi><mml:mi>a</mml:mi><mml:mi>r</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>L</mml:mi><mml:mi>i</mml:mi><mml:mi>D</mml:mi><mml:mi>A</mml:mi><mml:mi>R</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula></td>
<td>Ground truth depth value collected by LiDAR sensor</td>
</tr>
<tr>
<td><inline-formula id="ieqn-25"><mml:math id="mml-ieqn-25"><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula></td>
<td>Weight coefficient of semantic loss (determined as 1.2 via Bayesian optimization)</td>
</tr>
<tr>
<td><inline-formula id="ieqn-26"><mml:math id="mml-ieqn-26"><mml:msubsup><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup></mml:math></inline-formula></td>
<td>Semantic mask loss (enforcing pixel-level alignment between Grounded-SAM masks and Gaussians)</td>
</tr>
<tr>
<td><inline-formula id="ieqn-27"><mml:math id="mml-ieqn-27"><mml:msubsup><mml:mi>M</mml:mi><mml:mrow><mml:mi>S</mml:mi><mml:mi>A</mml:mi><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup></mml:math></inline-formula></td>
<td>Semantic mask value at pixel <inline-formula id="ieqn-28"><mml:math id="mml-ieqn-28"><mml:mi>p</mml:mi></mml:math></inline-formula> generated by Grounded-SAM (applying hard constraints on object boundaries)</td>
</tr>
<tr>
<td><inline-formula id="ieqn-29"><mml:math id="mml-ieqn-29"><mml:mrow><mml:mi>&#x1D4A9;</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula></td>
<td>Gaussian distribution at pixel <inline-formula id="ieqn-30"><mml:math id="mml-ieqn-30"><mml:mi>p</mml:mi></mml:math></inline-formula> (mean &#x003D; <inline-formula id="ieqn-31"><mml:math id="mml-ieqn-31"><mml:msub><mml:mi>x</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:math></inline-formula>, covariance matrix &#x003D; <inline-formula id="ieqn-32"><mml:math id="mml-ieqn-32"><mml:msub><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mi>j</mml:mi></mml:msub></mml:math></inline-formula>)</td>
</tr>
<tr>
<td><inline-formula id="ieqn-33"><mml:math id="mml-ieqn-33"><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x1D4A2;</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:mi>M</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula></td>
<td>Joint probability of scene representation (<inline-formula id="ieqn-34"><mml:math id="mml-ieqn-34"><mml:mrow><mml:mi>&#x1D4A2;</mml:mi></mml:mrow></mml:math></inline-formula>) and object detection (<italic>M</italic>)</td>
</tr>
<tr>
<td><inline-formula id="ieqn-35"><mml:math id="mml-ieqn-35"><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>M</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x1D4A2;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula></td>
<td>Conditional probability of object detection (<italic>M</italic>) given scene representation (<inline-formula id="ieqn-36"><mml:math id="mml-ieqn-36"><mml:mrow><mml:mi>&#x1D4A2;</mml:mi></mml:mrow></mml:math></inline-formula>)</td>
</tr>
<tr>
<td><inline-formula id="ieqn-37"><mml:math id="mml-ieqn-37"><mml:mi>G</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo>;</mml:mo><mml:mspace width="thinmathspace" /><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula></td>
<td>Gaussian kernel at 2D coordinate <inline-formula id="ieqn-38"><mml:math id="mml-ieqn-38"><mml:mi>q</mml:mi></mml:math></inline-formula> (center &#x003D; <inline-formula id="ieqn-39"><mml:math id="mml-ieqn-39"><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:math></inline-formula>, covariance matrix &#x003D; <inline-formula id="ieqn-40"><mml:math id="mml-ieqn-40"><mml:msub><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mi>i</mml:mi></mml:msub></mml:math></inline-formula>)</td>
</tr>
<tr>
<td><inline-formula id="ieqn-41"><mml:math id="mml-ieqn-41"><mml:msub><mml:mi>w</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula></td>
<td>Normalized weight of the <inline-formula id="ieqn-42"><mml:math id="mml-ieqn-42"><mml:mi>i</mml:mi></mml:math></inline-formula>-th Gaussian at 2D coordinate <inline-formula id="ieqn-43"><mml:math id="mml-ieqn-43"><mml:mi>q</mml:mi></mml:math></inline-formula> (sum of weights &#x003D; 1)</td>
</tr>
<tr>
<td><inline-formula id="ieqn-44"><mml:math id="mml-ieqn-44"><mml:msub><mml:mi>&#x03D5;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>I</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula></td>
<td>Feature map of original image <italic>I</italic> extracted from the <inline-formula id="ieqn-45"><mml:math id="mml-ieqn-45"><mml:mi>i</mml:mi></mml:math></inline-formula>-th layer of pre-trained network</td>
</tr>
<tr>
<td><inline-formula id="ieqn-46"><mml:math id="mml-ieqn-46"><mml:msub><mml:mi>&#x03D5;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>K</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula></td>
<td>Feature map of reconstructed image <italic>K</italic> extracted from the <inline-formula id="ieqn-47"><mml:math id="mml-ieqn-47"><mml:mi>i</mml:mi></mml:math></inline-formula>-th layer of pre-trained network</td>
</tr>
<tr>
<td><inline-formula id="ieqn-48"><mml:math id="mml-ieqn-48"><mml:msub><mml:mi mathvariant="normal">&#x2207;</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:math></inline-formula></td>
<td>Gradient-related parameter of the <inline-formula id="ieqn-49"><mml:math id="mml-ieqn-49"><mml:mi>i</mml:mi></mml:math></inline-formula>-th Gaussian (ensuring spatial regularity in regularization)</td>
</tr>
</tbody>
</table>
</table-wrap><fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>Schematic diagram of the SDG framework</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72544-fig-2.tif"/>
</fig>
<p><list list-type="order">
<list-item>
<p><bold>Multimodal Data Input and Preprocessing:</bold> Acquire raw data from cameras, LiDAR, and radar. Perform spatiotemporal alignment and convert the data into a unified format, stored as <inline-formula id="ieqn-10"><mml:math id="mml-ieqn-10"><mml:mi>D</mml:mi><mml:mo>=</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mtext>img</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mtext>lidar</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mtext>text</mml:mtext></mml:mrow></mml:msub><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></inline-formula>.</p></list-item>
<list-item>
<p><bold>Static-Dynamic Dual Gaussian Representation</bold>
<list list-type="simple">
<list-item><label>(a)</label><p><bold>Static Gaussian Modeling:</bold> Segment static regions, initialize 3D Gaussians via K-means&#x002B;&#x002B;, and optimize by minimizing the pixel loss <inline-formula id="ieqn-11"><mml:math id="mml-ieqn-11"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mtext>rgb</mml:mtext></mml:mrow></mml:msub></mml:math></inline-formula> through gradient descent.</p></list-item>
<list-item><label>(b)</label><p><bold>Dynamic Gaussian Modeling:</bold> Detect dynamic objects, initialize 3D Gaussians, and update them with temporal tracking using optical flow and Kalman filtering.</p></list-item>
</list></p></list-item>
<list-item>
<p><bold>Radar Guidance Fusion and Semantic Detection:</bold> Extract radar features, fuse them with the data represented by Gaussians, and use a pre-trained model to identify entity relationships and extract contextual information.</p></list-item>
<list-item>
<p><bold>Semantic-Aware Optimization and Real-Time Rendering:</bold> Introduce the semantic loss <inline-formula id="ieqn-12"><mml:math id="mml-ieqn-12"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mtext>sem</mml:mtext></mml:mrow></mml:msub></mml:math></inline-formula> for joint optimization with <inline-formula id="ieqn-13"><mml:math id="mml-ieqn-13"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mtext>rgb</mml:mtext></mml:mrow></mml:msub></mml:math></inline-formula>, eliminate noise with an adaptive algorithm, and perform real-time rendering using a differentiable renderer and GPU parallel computing.</p></list-item>
</list></p>
<sec id="s3_1">
<label>3.1</label>
<title>Static Background Representation and Reconstruction</title>
<p>To achieve efficient and high-fidelity environmental reconstruction, static background elements, such as roads and buildings, are modeled using a set of 3D Gaussian distributions, where each Gaussian is parameterized as follows:
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:mi>B</mml:mi><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mspace width="thinmathspace" /><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mspace width="thinmathspace" /><mml:msub><mml:mi>I</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msubsup></mml:math></disp-formula>where <inline-formula id="ieqn-50"><mml:math id="mml-ieqn-50"><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mn>3</mml:mn></mml:msup></mml:math></inline-formula> represents the 3D position of Gaussian <inline-formula id="ieqn-51"><mml:math id="mml-ieqn-51"><mml:mi>i</mml:mi></mml:math></inline-formula>, <inline-formula id="ieqn-52"><mml:math id="mml-ieqn-52"><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> defines the covariance matrix, modeling spatial uncertainty, and <inline-formula id="ieqn-53"><mml:math id="mml-ieqn-53"><mml:msub><mml:mi>I</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow></mml:math></inline-formula> denotes the intensity (appearance) of the Gaussian.</p>
<p>The depth <inline-formula id="ieqn-54"><mml:math id="mml-ieqn-54"><mml:mi>D</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> and radiance <inline-formula id="ieqn-55"><mml:math id="mml-ieqn-55"><mml:mi>R</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> at each pixel <inline-formula id="ieqn-56"><mml:math id="mml-ieqn-56"><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> are estimated through multi-view stereo reconstruction by aggregating contributions from all 3D Gaussians projected onto the image plane. Each Gaussian is parameterized by a mean <inline-formula id="ieqn-57"><mml:math id="mml-ieqn-57"><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:math></inline-formula> and a covariance matrix <inline-formula id="ieqn-58"><mml:math id="mml-ieqn-58"><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:math></inline-formula>, which jointly determine its influence on nearby pixels via a spatial Gaussian kernel.</p>
<p>Specifically, <inline-formula id="ieqn-59"><mml:math id="mml-ieqn-59"><mml:mi>D</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> represents the expected depth at pixel <inline-formula id="ieqn-60"><mml:math id="mml-ieqn-60"><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, computed as a Gaussian-weighted average over the individual depths <inline-formula id="ieqn-61"><mml:math id="mml-ieqn-61"><mml:msub><mml:mi>z</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:math></inline-formula> of each component. Similarly, <inline-formula id="ieqn-62"><mml:math id="mml-ieqn-62"><mml:mi>R</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> represents the radiance (or intensity), aggregated from the per-Gaussian intensity values <inline-formula id="ieqn-63"><mml:math id="mml-ieqn-63"><mml:msub><mml:mi>I</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:math></inline-formula> using the same weighting. This approach enables smooth and differentiable rendering by softly blending the contributions of overlapping 3D Gaussians.</p>
<p>Our implementation follows the differentiable splatting mechanism described by Kerbl et al. [<xref ref-type="bibr" rid="ref-4">4</xref>], which supports real-time rendering while maintaining photometric and geometric consistency across views.</p>
<p>To further enhance the reconstruction efficiency, we apply a Frobenius-norm regularization term to control the Gaussian spread and prevent overfitting:
<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mrow><mml:mtext>reg</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:msubsup><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mi>F</mml:mi><mml:mn>2</mml:mn></mml:msubsup></mml:math></disp-formula></p>
<p>The regularization loss <inline-formula id="ieqn-64"><mml:math id="mml-ieqn-64"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mtext>reg</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msubsup><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:msubsup><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mi>F</mml:mi><mml:mn>2</mml:mn></mml:msubsup></mml:math></inline-formula> constrains the spread of each Gaussian by penalizing the Frobenius norm of its covariance matrix <inline-formula id="ieqn-65"><mml:math id="mml-ieqn-65"><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:math></inline-formula>, where <inline-formula id="ieqn-66"><mml:math id="mml-ieqn-66"><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:msub><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mi>F</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msqrt><mml:msub><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>,</mml:mo><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:msubsup><mml:mi>&#x03C3;</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mi>l</mml:mi></mml:mrow><mml:mn>2</mml:mn></mml:msubsup></mml:msqrt></mml:math></inline-formula> measures the spatial dispersion. This regularization encourages compact Gaussians, thereby improving static scene fidelity and maintaining computational efficiency for real-time rendering.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Dynamic Object Detection and Segmentation</title>
<p>Dynamic objects, such as vehicles and pedestrians, are detected using Grounded-SAM. The process begins with input from the reconstructed depth maps <inline-formula id="ieqn-67"><mml:math id="mml-ieqn-67"><mml:mrow><mml:mtext mathvariant="bold">D</mml:mtext></mml:mrow></mml:math></inline-formula> and radiance fields <inline-formula id="ieqn-68"><mml:math id="mml-ieqn-68"><mml:mrow><mml:mi>&#x211B;</mml:mi></mml:mrow></mml:math></inline-formula>. The steps are as follows:
<list list-type="order">
<list-item>
<p><bold>Depth-Based Proposals:</bold> Object proposals are generated by clustering regions in <inline-formula id="ieqn-69"><mml:math id="mml-ieqn-69"><mml:mrow><mml:mtext mathvariant="bold">D</mml:mtext></mml:mrow></mml:math></inline-formula> where depth gradients exceed a threshold [<xref ref-type="bibr" rid="ref-36">36</xref>]. This step is important for segmenting regions of the scene that exhibit noticeable changes in depth, as these typically correspond to objects of interest, such as vehicles or pedestrians. Depth-gradient thresholding focuses on the most prominent structures in the scene. Here <inline-formula id="ieqn-70"><mml:math id="mml-ieqn-70"><mml:msub><mml:mrow><mml:mi>&#x1D4AB;</mml:mi></mml:mrow><mml:mi>i</mml:mi></mml:msub></mml:math></inline-formula> represents the proposed regions, <inline-formula id="ieqn-71"><mml:math id="mml-ieqn-71"><mml:mi>&#x03B4;</mml:mi></mml:math></inline-formula> is the gradient threshold, and <inline-formula id="ieqn-72"><mml:math id="mml-ieqn-72"><mml:mi mathvariant="normal">&#x2207;</mml:mi><mml:mi>D</mml:mi></mml:math></inline-formula> is computed using Sobel filtering, as detailed in [<xref ref-type="bibr" rid="ref-37">37</xref>].</p></list-item>
<list-item>
<p><bold>Grounded-SAM Detection:</bold> The radiance field <inline-formula id="ieqn-73"><mml:math id="mml-ieqn-73"><mml:mrow><mml:mi>&#x211B;</mml:mi></mml:mrow></mml:math></inline-formula> and object proposals <inline-formula id="ieqn-74"><mml:math id="mml-ieqn-74"><mml:mrow><mml:mi>&#x1D4AB;</mml:mi></mml:mrow></mml:math></inline-formula> are passed to Grounded-SAM for object detection and segmentation. This method utilizes a text-driven approach to detect and segment objects based on the object proposals, generating bounding boxes <inline-formula id="ieqn-75"><mml:math id="mml-ieqn-75"><mml:mrow><mml:mi>&#x0212C;</mml:mi></mml:mrow></mml:math></inline-formula> and segmentation masks <inline-formula id="ieqn-76"><mml:math id="mml-ieqn-76"><mml:mrow><mml:mi>&#x02133;</mml:mi></mml:mrow></mml:math></inline-formula> for each detected object. This step helps to localize and segment moving objects, which is crucial for subsequent tracking. The detection and segmentation process follows the framework of Grounded-SAM as described in [<xref ref-type="bibr" rid="ref-34">34</xref>,<xref ref-type="bibr" rid="ref-35">35</xref>].</p></list-item>
<list-item>
<p><bold>Depth Association:</bold> Detected objects are associated with their 3D positions using the depth map <inline-formula id="ieqn-77"><mml:math id="mml-ieqn-77"><mml:mrow><mml:mtext mathvariant="bold">D</mml:mtext></mml:mrow></mml:math></inline-formula>. For each object <inline-formula id="ieqn-78"><mml:math id="mml-ieqn-78"><mml:mi>j</mml:mi></mml:math></inline-formula>, we estimate its spatial centroid <inline-formula id="ieqn-79"><mml:math id="mml-ieqn-79"><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:math></inline-formula> by minimizing the difference between its projected position and local depth values. This depth-guided association improves localization accuracy and provides contextual alignment for downstream tracking [<xref ref-type="bibr" rid="ref-38">38</xref>].</p></list-item>
<list-item>
<p><bold>Monocular Depth Refinement:</bold> Depth is further refined using a monocular depth model [<xref ref-type="bibr" rid="ref-6">6</xref>] such as MiDaS. The initial depth estimation can often be imprecise due to the complexity of the scene or sensor limitations. By using a monocular depth model, we improve the accuracy of the depth information, which is crucial for better object tracking and segmentation in dynamic urban environments. Refining the depth estimates with monocular cues helps to align the 3D models more accurately with the actual scene, especially in cases where stereo or LiDAR data may be sparse or noisy.</p></list-item>
</list></p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Dynamic Object Representation and Tracking</title>
<p>To model and track moving objects within the urban environment, we represent dynamic entities using time-dependent 3D Gaussian distributions. Each detected object is parameterized as follows:
<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:msub><mml:mi>F</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mi>j</mml:mi></mml:munder><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:msubsup><mml:mo fence="false" stretchy="false">}</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:msubsup></mml:math></disp-formula>where <inline-formula id="ieqn-80"><mml:math id="mml-ieqn-80"><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mn>3</mml:mn></mml:msup></mml:math></inline-formula> represents the 3D position of the object <inline-formula id="ieqn-81"><mml:math id="mml-ieqn-81"><mml:mi>j</mml:mi></mml:math></inline-formula> at time <inline-formula id="ieqn-82"><mml:math id="mml-ieqn-82"><mml:mi>t</mml:mi></mml:math></inline-formula>, <inline-formula id="ieqn-83"><mml:math id="mml-ieqn-83"><mml:msub><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi 
mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> defines the covariance matrix modeling the uncertainty of the object&#x02019;s shape, <inline-formula id="ieqn-84"><mml:math id="mml-ieqn-84"><mml:msub><mml:mi>I</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow></mml:math></inline-formula> denotes the intensity (appearance) of object <inline-formula id="ieqn-85"><mml:math id="mml-ieqn-85"><mml:mi>j</mml:mi></mml:math></inline-formula>, and <italic>M</italic> is the total number of dynamic objects detected.</p>
<p>To track dynamic objects across frames, we adopt a feature-based optical flow method combined with Kalman filtering. Specifically, we estimate the displacement vector <inline-formula id="ieqn-86"><mml:math id="mml-ieqn-86"><mml:msub><mml:mi>F</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mn>3</mml:mn></mml:msup></mml:math></inline-formula> using optical flow, and then apply Kalman filtering to smooth the predicted trajectories and suppress noise, yielding the refined motion estimate <inline-formula id="ieqn-87"><mml:math id="mml-ieqn-87"><mml:msub><mml:mi>u</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>.</p>
<p>We integrate depth-based association to enhance the temporal consistency between frames. For each object <inline-formula id="ieqn-88"><mml:math id="mml-ieqn-88"><mml:mi>j</mml:mi></mml:math></inline-formula> at time <inline-formula id="ieqn-89"><mml:math id="mml-ieqn-89"><mml:mi>t</mml:mi></mml:math></inline-formula>, we calculate a match cost with the candidates at <inline-formula id="ieqn-90"><mml:math id="mml-ieqn-90"><mml:mi>t</mml:mi><mml:mrow><mml:mo>+</mml:mo></mml:mrow><mml:mn>1</mml:mn></mml:math></inline-formula> based on the spatial displacement between the centroids and the covariance differences. A weighting factor <inline-formula id="ieqn-91"><mml:math id="mml-ieqn-91"><mml:mi>&#x03BB;</mml:mi></mml:math></inline-formula> balances these two terms, enabling a robust association of dynamic objects across frames.</p>
<p>To manage occlusions in dense traffic environments, we implement an adaptive re-initialization strategy. The predicted position <inline-formula id="ieqn-92"><mml:math id="mml-ieqn-92"><mml:msubsup><mml:mi>&#x03BC;</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mtext>pred</mml:mtext></mml:mrow></mml:msubsup><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mrow><mml:mo>+</mml:mo></mml:mrow><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> of an occluded object is calculated as a weighted combination of its last reliable observation <inline-formula id="ieqn-93"><mml:math id="mml-ieqn-93"><mml:msubsup><mml:mi>&#x03BC;</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mtext>prev</mml:mtext></mml:mrow></mml:msubsup><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> and its current estimate <inline-formula id="ieqn-94"><mml:math id="mml-ieqn-94"><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, using a confidence factor <inline-formula id="ieqn-95"><mml:math id="mml-ieqn-95"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> to balance temporal consistency. If an object remains undetected for more than <inline-formula id="ieqn-96"><mml:math id="mml-ieqn-96"><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mtext>frames</mml:mtext></mml:mrow></mml:msub></mml:math></inline-formula>, it is removed from the tracking pool.</p>
<p>To ensure smooth object trajectories over time, we introduce a temporal regularization loss that penalizes inconsistent motion estimates between consecutive frames. Specifically, for each object <inline-formula id="ieqn-97"><mml:math id="mml-ieqn-97"><mml:mi>j</mml:mi></mml:math></inline-formula>, the predicted position <inline-formula id="ieqn-98"><mml:math id="mml-ieqn-98"><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> at time <inline-formula id="ieqn-99"><mml:math id="mml-ieqn-99"><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula> is encouraged to follow the expected displacement <inline-formula id="ieqn-100"><mml:math id="mml-ieqn-100"><mml:msub><mml:mi>F</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> from the previous position <inline-formula id="ieqn-101"><mml:math id="mml-ieqn-101"><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>. The discrepancy between the predicted and expected position changes is measured and accumulated over the entire sequence length <italic>T</italic>. This regularization discourages abrupt deviations in the trajectory and promotes temporal coherence, which is particularly important for robust tracking in dynamic urban environments.</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Semantic-Aware Optimization of Reconstruction &#x0026; Detection</title>
<p>We propose a unified loss framework that balances photorealism, geometric accuracy, and semantic consistency through three complementary objectives:
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:munder><mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mrow><mml:mtext>rend</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mo>&#x23DF;</mml:mo></mml:munder></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>rgb</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mrow><mml:mtext>depth</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:munder><mml:mrow><mml:munder><mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mrow><mml:mtext>rend</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mrow><mml:mtext>lidar</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mo 
stretchy="false">|</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msub></mml:mrow><mml:mo>&#x23DF;</mml:mo></mml:munder></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>depth</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mrow><mml:mtext>sem</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:munder><mml:mrow><mml:munder><mml:msubsup><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>sem</mml:mtext></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mtext>(mask)</mml:mtext></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x23DF;</mml:mo></mml:munder></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>sem</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mrow></mml:munder></mml:math></disp-formula>where <inline-formula id="ieqn-102"><mml:math id="mml-ieqn-102"><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mtext>depth</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.8</mml:mn></mml:math></inline-formula>, <inline-formula id="ieqn-103"><mml:math id="mml-ieqn-103"><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mtext>sem</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mn>1.2</mml:mn></mml:mrow></mml:math></inline-formula> which was determined via Bayesian optimization over 500 iterations.</p>
<p>The core innovation lies in <inline-formula id="ieqn-104"><mml:math id="mml-ieqn-104"><mml:msubsup><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mtext>sem</mml:mtext></mml:mrow><mml:mrow><mml:mtext>(mask)</mml:mtext></mml:mrow></mml:msubsup></mml:math></inline-formula>, which enforces pixel-wise mask-Gaussian alignment through:
<disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:msubsup><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>sem</mml:mtext></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mtext>(mask)</mml:mtext></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi mathvariant="normal">&#x03A9;</mml:mi></mml:mrow></mml:munder><mml:msubsup><mml:mrow><mml:mtext mathvariant="bold">M</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>SAM</mml:mtext></mml:mrow></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x1D4B1;</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:munder><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mrow><mml:mi>&#x1D4A9;</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext mathvariant="bold">p</mml:mtext></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mtext mathvariant="bold">x</mml:mtext></mml:mrow><mml:mi>j</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>where <inline-formula id="ieqn-105"><mml:math id="mml-ieqn-105"><mml:msup><mml:mrow><mml:mi>&#x1D4B1;</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:math></inline-formula> denotes Gaussians visible in the 
pixel <inline-formula id="ieqn-106"><mml:math id="mml-ieqn-106"><mml:mi>p</mml:mi></mml:math></inline-formula>. The depth loss <inline-formula id="ieqn-107"><mml:math id="mml-ieqn-107"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mtext>depth</mml:mtext></mml:mrow></mml:msub></mml:math></inline-formula> uses LiDAR measurements:
<disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>depth</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x03A9;</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi mathvariant="normal">&#x03A9;</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mrow><mml:mtext>smooth</mml:mtext></mml:mrow><mml:mrow><mml:mi>L</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>d</mml:mi><mml:mrow><mml:mrow><mml:mtext>rend</mml:mtext></mml:mrow></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>&#x2212;</mml:mo><mml:msubsup><mml:mi>d</mml:mi><mml:mrow><mml:mrow><mml:mtext>LiDAR</mml:mtext></mml:mrow></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p>This term ensures that only Gaussians visible in pixel <inline-formula id="ieqn-108"><mml:math id="mml-ieqn-108"><mml:mi>p</mml:mi></mml:math></inline-formula> contribute to the rendering, while the <inline-formula id="ieqn-109"><mml:math id="mml-ieqn-109"><mml:msubsup><mml:mrow><mml:mtext mathvariant="bold">M</mml:mtext></mml:mrow><mml:mrow><mml:mtext>SAM</mml:mtext></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup></mml:math></inline-formula> term imposes hard constraints on the boundaries of the object.</p>
<p>To handle dynamic objects, we implement two critical optimizations:
<list list-type="order">
<list-item>
<p>Mask propagation: When a Gaussian is occluded (<inline-formula id="ieqn-110"><mml:math id="mml-ieqn-110"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo>&#x003C;</mml:mo><mml:mn>0.05</mml:mn></mml:math></inline-formula>), its mask is inherited by the neighboring Gaussians with similarity score <inline-formula id="ieqn-111"><mml:math id="mml-ieqn-111"><mml:mo>&#x2265;</mml:mo><mml:mn>0.75</mml:mn></mml:math></inline-formula> (computed via the Chamfer distance).</p></list-item>
<list-item>
<p>Temporal smoothing: Apply a 3-frame moving average to SAM masks to reduce detection jitter.</p></list-item>
</list></p>
<p><xref ref-type="table" rid="table-3">Table 3</xref> shows the impact of each loss component:</p>
<table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>Ablation study on loss components</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Configuration</th>
<th>PSNR</th>
<th>mAP</th>
<th>FPS</th>
</tr>
</thead>
<tbody>
<tr>
<td>No semantic loss</td>
<td>33.1</td>
<td>58.3</td>
<td>89</td>
</tr>
<tr>
<td>With semantic loss</td>
<td><bold>34.9</bold></td>
<td><bold>67.2</bold></td>
<td>82</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-3fn1" fn-type="other">
<p>Note: Bold results indicate the best performance in each category.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>Removing the semantic term causes:
<list list-type="bullet">
<list-item>
<p>13.2% reduction in Mean Average Precision (mAP)@0.5 (from 67.2 to 58.3, cf. <xref ref-type="table" rid="table-3">Table 3</xref>)</p></list-item>
<list-item>
<p>1.8 dB Peak Signal-to-Noise Ratio (PSNR) drop (from 34.9 to 33.1, cf. <xref ref-type="table" rid="table-3">Table 3</xref>)</p></list-item>
<list-item>
<p>A 7 FPS speedup (from 82 to 89), since the mask constraints are disabled.</p></list-item>
</list></p>
<p>The proposed loss can be viewed as a variational lower bound on the joint probability of scene representation and object detection:
<disp-formula id="eqn-7"><label>(7)</label><mml:math id="mml-eqn-7" display="block"><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x1D4A2;</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mtext mathvariant="bold">M</mml:mtext></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x1D4A2;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext mathvariant="bold">M</mml:mtext></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x1D4A2;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x221D;</mml:mo><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>rgb</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>depth</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>sem</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p>Here, <inline-formula id="ieqn-112"><mml:math id="mml-ieqn-112"><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x1D4A2;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> is the probability of the scene representation (3D Gaussian distribution), <inline-formula id="ieqn-113"><mml:math id="mml-ieqn-113"><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext mathvariant="bold">M</mml:mtext></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x1D4A2;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> is the conditional probability of object detection given the scene representation, and the terms <inline-formula id="ieqn-114"><mml:math id="mml-ieqn-114"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mtext>rgb</mml:mtext></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-115"><mml:math id="mml-ieqn-115"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mtext>depth</mml:mtext></mml:mrow></mml:msub></mml:math></inline-formula>, and <inline-formula id="ieqn-116"><mml:math id="mml-ieqn-116"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mtext>sem</mml:mtext></mml:mrow></mml:msub></mml:math></inline-formula> represent the photometric, depth, and semantic losses, respectively. By maximizing this posterior, we achieve both accurate reconstruction and consistent detection.</p>
<p>To enhance depth accuracy and robustness in complex urban scenes, we adopt a radar-guided refinement strategy. Radar measurements provide sparse but geometrically reliable depth cues, which are projected to the image domain and used to guide the refinement of visual depth predictions. During feature fusion, radar and visual features are aligned according to their geometric correspondence, and a lightweight gating mechanism adaptively balances the two sources. When visual cues are degraded by lighting or motion, radar information dominates; otherwise, visual details are preserved. This simple yet effective design improves geometric consistency without adding extra modules.</p>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Scene Rendering and Visualization</title>
<p>The final reconstructed urban scene is rendered by projecting both static and dynamic 3D Gaussians onto the 2D image plane. This process involves transforming each Gaussian&#x2019;s 3D position into screen space using the camera intrinsic matrix <italic>K</italic>, rotation matrix <italic>R</italic>, and translation vector <inline-formula id="ieqn-117"><mml:math id="mml-ieqn-117"><mml:mi>t</mml:mi></mml:math></inline-formula> through the transformation equation:
<disp-formula id="eqn-8"><label>(8)</label><mml:math id="mml-eqn-8" display="block"><mml:mi>q</mml:mi><mml:mo>=</mml:mo><mml:mi>K</mml:mi><mml:mo>&#x22C5;</mml:mo><mml:mi>R</mml:mi><mml:mo>&#x22C5;</mml:mo><mml:mi>&#x03BC;</mml:mi><mml:mo>+</mml:mo><mml:mi>t</mml:mi></mml:math></disp-formula>where <inline-formula id="ieqn-118"><mml:math id="mml-ieqn-118"><mml:mi>&#x03BC;</mml:mi></mml:math></inline-formula> represents the 3D position of the Gaussian and <inline-formula id="ieqn-119"><mml:math id="mml-ieqn-119"><mml:mi>q</mml:mi></mml:math></inline-formula> denotes the projected 2D coordinate. Each Gaussian&#x2019;s contribution to the rendered intensity is computed based on its spatial distribution and opacity as follows:
<disp-formula id="eqn-9"><label>(9)</label><mml:math id="mml-eqn-9" display="block"><mml:mi>I</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:munder><mml:mi>G</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo>;</mml:mo><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:math></disp-formula>where <inline-formula id="ieqn-120"><mml:math id="mml-ieqn-120"><mml:mi>G</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo>;</mml:mo><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> represents the Gaussian kernel centered at <inline-formula id="ieqn-121"><mml:math id="mml-ieqn-121"><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:math></inline-formula> with covariance <inline-formula id="ieqn-122"><mml:math id="mml-ieqn-122"><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:math></inline-formula>, and <inline-formula id="ieqn-123"><mml:math id="mml-ieqn-123"><mml:msub><mml:mi>I</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:math></inline-formula> is the corresponding intensity value. To enhance depth perception and realism, a depth-weighted blending function is applied during rendering, ensuring that closer objects occlude further ones:
<disp-formula id="eqn-10"><label>(10)</label><mml:math id="mml-eqn-10" display="block"><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mrow><mml:mtext>final</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mi>w</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mspace width="1em" /><mml:msub><mml:mi>w</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>G</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo>;</mml:mo><mml:mspace width="thinmathspace" /><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mspace width="thinmathspace" /><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mi>j</mml:mi></mml:munder><mml:mi>G</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo>;</mml:mo><mml:mspace width="thinmathspace" /><mml:msub><mml:mi>&#x03BC;</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mspace width="thinmathspace" /><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mfrac></mml:math></disp-formula>where <inline-formula id="ieqn-124"><mml:math id="mml-ieqn-124"><mml:msub><mml:mi>w</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> denotes the normalized weight of each Gaussian at pixel <inline-formula 
id="ieqn-125"><mml:math id="mml-ieqn-125"><mml:mi>q</mml:mi></mml:math></inline-formula>. Additionally, dynamic objects are separately composited using motion-aware temporal filtering, which smooths rapid movements by applying a temporal exponential decay to the previous frame: <inline-formula id="ieqn-126"><mml:math id="mml-ieqn-126"><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mtext>smooth</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo>,</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>&#x03B2;</mml:mi><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mtext>final</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo>,</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03B2;</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mtext>final</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo>,</mml:mo><mml:mi>t</mml:mi><mml:mrow><mml:mo>&#x2212;</mml:mo></mml:mrow><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, where <inline-formula id="ieqn-127"><mml:math id="mml-ieqn-127"><mml:mi>&#x03B2;</mml:mi></mml:math></inline-formula> is a smoothing factor that controls frame-to-frame consistency. This ensures that objects retain motion blur while preserving sharpness.</p>
<p>To further improve rendering efficiency, adaptive resolution upsampling is employed, leveraging multi-scale Gaussian sampling to dynamically refine high-detail areas while reducing computational overhead in less critical regions. The final rendered frames are then visualized with overlaid segmentation masks, derived from Grounded-SAM detection, allowing for real-time interaction with the reconstructed scene and facilitating urban traffic analysis.</p>
</sec>
<sec id="s3_6">
<label>3.6</label>
<title>Gaussian Parameter Optimization with Dual Loss</title>
<p>The parameters of both static and dynamic Gaussians are optimized jointly using a combination of photometric loss and regularization [<xref ref-type="bibr" rid="ref-39">39</xref>]. The total loss function is defined as:
<disp-formula id="eqn-11"><label>(11)</label><mml:math id="mml-eqn-11" display="block"><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:munder><mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mrow><mml:mtext>rendered</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msub></mml:mrow><mml:mo>&#x23DF;</mml:mo></mml:munder></mml:mrow><mml:mrow><mml:mrow><mml:mtext>Photometric</mml:mtext></mml:mrow></mml:mrow></mml:munder><mml:mo>+</mml:mo><mml:mi>&#x03BB;</mml:mi><mml:munder><mml:mrow><mml:munder><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>N</mml:mi></mml:mfrac><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>F</mml:mi></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x23DF;</mml:mo></mml:munder></mml:mrow><mml:mrow><mml:mrow><mml:mtext>Regularization</mml:mtext></mml:mrow></mml:mrow></mml:munder></mml:math></disp-formula></p>
<p>The total loss function combines a photometric term and a spatial regularization term: <inline-formula id="ieqn-128"><mml:math id="mml-ieqn-128"><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mrow><mml:mover><mml:mi>I</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msub><mml:msubsup><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mn>2</mml:mn><mml:mn>2</mml:mn></mml:msubsup><mml:mo>+</mml:mo><mml:mi>&#x03BB;</mml:mi><mml:msub><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mi>i</mml:mi></mml:msub><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:msubsup><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mi>F</mml:mi><mml:mn>2</mml:mn></mml:msubsup></mml:math></inline-formula>, where <inline-formula id="ieqn-129"><mml:math id="mml-ieqn-129"><mml:mrow><mml:mover><mml:mi>I</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow></mml:math></inline-formula> is the rendered image, <inline-formula id="ieqn-130"><mml:math id="mml-ieqn-130"><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msub></mml:math></inline-formula> is the ground truth, and <inline-formula id="ieqn-131"><mml:math id="mml-ieqn-131"><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:math></inline-formula> denotes the covariance matrix of Gaussian <inline-formula id="ieqn-132"><mml:math id="mml-ieqn-132"><mml:mi>i</mml:mi></mml:math></inline-formula>. The photometric loss enforces pixel-level consistency with the target image, while the regularization term constrains the spatial spread of Gaussians, encouraging compact and stable representations. The trade-off is controlled by a weighting factor <inline-formula id="ieqn-133"><mml:math id="mml-ieqn-133"><mml:mi>&#x03BB;</mml:mi></mml:math></inline-formula>, which balances visual fidelity and smoothness.
This design follows best practices in differentiable Gaussian rendering [<xref ref-type="bibr" rid="ref-4">4</xref>].</p>
<p><bold>Explanation of <inline-formula id="ieqn-134"><mml:math id="mml-ieqn-134"><mml:mi>&#x03BB;</mml:mi></mml:math></inline-formula>:</bold> The hyperparameter <inline-formula id="ieqn-135"><mml:math id="mml-ieqn-135"><mml:mi>&#x03BB;</mml:mi></mml:math></inline-formula> must be tuned based on the specific application and dataset. For example, in scenarios with noisy input data, a larger <inline-formula id="ieqn-136"><mml:math id="mml-ieqn-136"><mml:mi>&#x03BB;</mml:mi></mml:math></inline-formula> helps reduce noise by prioritizing smoothness. Conversely, for highly detailed reconstructions, a smaller <inline-formula id="ieqn-137"><mml:math id="mml-ieqn-137"><mml:mi>&#x03BB;</mml:mi></mml:math></inline-formula> is preferable to emphasize photometric accuracy.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experimental Evaluation</title>
<p>Understanding our approach&#x2019;s performance in real-world scenarios is crucial for validating its effectiveness. In this section, we present a comprehensive evaluation of our method, comparing it against existing techniques using publicly available datasets. We assess key aspects such as reconstruction accuracy, object detection performance, computational efficiency, and scalability in large-scale urban environments.</p>
<sec id="s4_1">
<label>4.1</label>
<title>Dataset Description</title>
<p>For this study, we use datasets suitable for reconstructing and analyzing dynamic urban environments. Therefore, we utilize the Waymo Open Dataset, which provides large-scale multi-view imagery and LiDAR data, enabling both high-fidelity 3D reconstruction and accurate object detection. This dataset is chosen for its diverse urban scenarios, including varying lighting conditions, traffic densities, and occlusions. The dataset provides synchronized camera and LiDAR data, enabling the generation of depth maps and radiance fields for static and dynamic object modeling.</p>
<p>These datasets enable the evaluation of our method&#x2019;s performance under various scenarios, including traffic reconstruction and dynamic object tracking.</p>
<p>We assess our model&#x2019;s performance using several key metrics. For 3D scene reconstruction, we measure PSNR and Structural Similarity Index Measure (SSIM, [<xref ref-type="bibr" rid="ref-40">40</xref>]) to quantify rendering fidelity. Additionally, the perceptual loss based on a pre-trained Visual Geometry Group (VGG network [<xref ref-type="bibr" rid="ref-41">41</xref>]) is computed to capture high-level feature consistency. For dynamic object detection, we report mAP with an Intersection over Union (IoU) threshold of 0.5, as well as IoU scores to evaluate object localization accuracy. Finally, we analyze computational efficiency, comparing the frames per second (FPS) across multiple baselines, ensuring the real-time feasibility of our approach.</p>
<p>The Waymo Open Dataset is a comprehensive collection of autonomous driving data, featuring synchronized high-resolution camera and LiDAR data from self-driving vehicles. It includes 3D point cloud sequences that support object detection, shape reconstruction, and tracking. For our experiments, we use sequences containing dynamic traffic scenarios with multiple moving vehicles and pedestrians. The KITTI and KITTI-360 datasets [<xref ref-type="bibr" rid="ref-8">8</xref>] are additionally employed for broader validation.</p>
<sec id="s4_1_1">
<label>4.1.1</label>
<title>Reconstruction Metrics</title>
<p>Structural Similarity Index Measure (SSIM, [<xref ref-type="bibr" rid="ref-40">40</xref>]): Measures the similarity between two images by comparing luminance, contrast, and structure. SSIM values range from &#x2212;1 to 1, with higher values indicating greater similarity.</p>
<p>PSNR: Quantifies the reconstruction quality by measuring the ratio between the maximum pixel value and the mean squared error (MSE):
<disp-formula id="eqn-12"><label>(12)</label><mml:math id="mml-eqn-12" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:mrow><mml:mtext>PSNR</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mn>10</mml:mn><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mi>log</mml:mi><mml:mrow><mml:mn>10</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:msup><mml:mrow><mml:mtext>MAX</mml:mtext></mml:mrow><mml:mn>2</mml:mn></mml:msup><mml:mrow><mml:mtext>MSE</mml:mtext></mml:mrow></mml:mfrac><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula></p>
<p>Higher PSNR values indicate better quality.</p>
<p>Perceptual Loss: Compares high-level features extracted from a pre-trained neural network, such as VGG, to evaluate perceptual similarity between original and reconstructed images:
<disp-formula id="eqn-13"><label>(13)</label><mml:math id="mml-eqn-13" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>perceptual</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:munder><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:msub><mml:mi>&#x03D5;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>I</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x03D5;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>K</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msubsup><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mn>2</mml:mn><mml:mn>2</mml:mn></mml:msubsup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>where <inline-formula id="ieqn-138"><mml:math id="mml-ieqn-138"><mml:msub><mml:mi>&#x03D5;</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:math></inline-formula> represents feature maps of the <inline-formula id="ieqn-139"><mml:math id="mml-ieqn-139"><mml:mi>i</mml:mi></mml:math></inline-formula>-th network layer. <italic>I</italic> is the original image, and <italic>K</italic> is the reconstructed image. These images are compared by extracting their high-level features through a pre-trained neural network, such as VGG, to evaluate the perceptual similarity between them.</p>
</sec>
<sec id="s4_1_2">
<label>4.1.2</label>
<title>Detection Metrics</title>
<p>Mean Average Precision (mAP): Measures object detection accuracy by averaging precision-recall across all classes. Higher mAP values indicate better detection performance. Intersection over Union (IoU): Evaluates localization accuracy by calculating the overlap between predicted and ground truth bounding boxes.</p>
</sec>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Experimental Setup</title>
<p><bold>Preprocessing.</bold> We preprocess the Waymo dataset by synchronizing LiDAR and camera data to derive depth maps and sparse 3D point clouds. Additional refinement steps include applying monocular depth estimation (via MiDaS) to improve depth consistency and aligning camera poses for accurate Gaussian initialization.</p>
<p><bold>Baseline and Framework.</bold> Our method builds upon the Street Gaussians framework for 3D reconstruction and introduces the following enhancements:
<list list-type="bullet">
<list-item>
<p>Radar-assisted annotation for dynamic object association.</p></list-item>
<list-item>
<p>Temporal smoothing to enhance frame consistency.</p></list-item>
<list-item>
<p>Cubemap-based sky modeling to refine static scene representation.</p></list-item>
<list-item>
<p>Optimized Gaussian parameters using the loss function:</p></list-item>
</list></p>
<p><disp-formula id="eqn-14"><label>(14)</label><mml:math id="mml-eqn-14" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>photo</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>&#x03BB;</mml:mi><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>reg</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>where <inline-formula id="ieqn-140"><mml:math id="mml-ieqn-140"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mtext>photo</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mrow><mml:mover><mml:mi>I</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msub><mml:msubsup><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mn>2</mml:mn><mml:mn>2</mml:mn></mml:msubsup></mml:math></inline-formula> evaluates photometric consistency and <inline-formula id="ieqn-141"><mml:math id="mml-ieqn-141"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mrow><mml:mtext>reg</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msubsup><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:msub><mml:mi mathvariant="normal">&#x03A3;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:msubsup><mml:mo fence="false" 
stretchy="false">&#x2016;</mml:mo><mml:mi>F</mml:mi><mml:mn>2</mml:mn></mml:msubsup></mml:math></inline-formula> ensures spatial regularity.</p>
<p><bold>Object Detection and Segmentation.</bold> We integrate Grounded-SAM for object detection, which utilizes text-based prompts (&#x201C;car&#x201D;, &#x201C;pedestrian&#x201D;) to generate bounding boxes and segmentation masks. These masks are associated with 3D positions derived from depth maps for accurate dynamic object tracking.</p>
<p><bold>Comparison Methods.</bold> Baselines include 3DGS, MARS, and Street Gaussians, covering both static reconstruction and dynamic detection benchmarks.</p>
<p><bold>Implementation.</bold> To measure the performance of the proposed approach and the benchmarks, we used a device setup based on NVIDIA A100 GPUs with 40 GB memory. Rendering resolutions are set to <inline-formula id="ieqn-142"><mml:math id="mml-ieqn-142"><mml:mn>1066</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1600</mml:mn></mml:math></inline-formula> (Waymo) and <inline-formula id="ieqn-143"><mml:math id="mml-ieqn-143"><mml:mn>375</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1242</mml:mn></mml:math></inline-formula> (KITTI), while FPS is measured under real-time constraints.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Results</title>
<p>We compare our method against state-of-the-art baselines, including NeRF, 3DGS, and Street Gaussians. <xref ref-type="table" rid="table-4">Table 4</xref> summarizes the rendering speed, demonstrating that our method achieves a significant performance boost while maintaining high rendering quality.</p>
<table-wrap id="table-4">
<label>Table 4</label>
<caption>
<title>Comparison of rendering speed and quality across different methods</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Method</th>
<th>PSNR</th>
<th>SSIM</th>
<th>FPS</th>
</tr>
</thead>
<tbody>
<tr>
<td>NeRF</td>
<td>27.3</td>
<td>0.841</td>
<td>0.002</td>
</tr>
<tr>
<td>3DGS</td>
<td>30.1</td>
<td>0.892</td>
<td>63.0</td>
</tr>
<tr>
<td>Street Gaussians</td>
<td>34.6</td>
<td>0.938</td>
<td>125.0</td>
</tr>
<tr>
<td>Ours</td>
<td><bold>34.9</bold></td>
<td><bold>0.940</bold></td>
<td><bold>135.0</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-4fn1" fn-type="other">
<p>Note: Bold results indicate the best performance in each category.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>Our approach outperforms previous methods in both rendering quality and speed. Notably, our model achieves <inline-formula id="ieqn-144"><mml:math id="mml-ieqn-144"><mml:mn>2.14</mml:mn><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> faster inference than 3DGS while maintaining higher PSNR and SSIM values. Additionally, for dynamic object detection, our method surpasses 3DGS by 25% in mAP, indicating improved tracking and segmentation capabilities.</p>
<p><xref ref-type="table" rid="table-5">Table 5</xref> compares the FPS across datasets, demonstrating our method&#x2019;s real-time rendering capabilities. Our method outperforms others in reconstruction quality, as shown in <xref ref-type="table" rid="table-6">Table 6</xref>. <xref ref-type="fig" rid="fig-3">Fig. 3</xref> visually illustrates our approach&#x2019;s capability to retain structural details and achieve lower perceptual loss. This highlights the robustness of our enhanced Gaussian representation in static background reconstruction.</p>
<table-wrap id="table-5">
<label>Table 5</label>
<caption>
<title>Rendering speed comparison across datasets (FPS)</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Method</th>
<th>Waymo</th>
<th>KITTI</th>
<th>KITTI-360</th>
</tr>
</thead>
<tbody>
<tr>
<td>MC-NeRF</td>
<td>0.0014</td>
<td>0.0075</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>3DGS</td>
<td>63.0</td>
<td>125.0</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>MARS</td>
<td>0.030</td>
<td>0.31</td>
<td>0.25</td>
</tr>
<tr>
<td>Ours</td>
<td><bold>135.0</bold></td>
<td><bold>59.0</bold></td>
<td><bold>50.0</bold></td>
</tr>
</tbody>
</table>
</table-wrap><table-wrap id="table-6">
<label>Table 6</label>
<caption>
<title>Reconstruction quality comparison across datasets. Metrics include PSNR, SSIM, and Learned Perceptual Image Patch Similarity (LPIPS)</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Dataset</th>
<th colspan="3">Waymo</th>
<th colspan="3">KITTI</th>
<th colspan="3">KITTI-360</th>
</tr>
</thead>
<tbody>
<tr>
<td><bold>Method</bold></td>
<td>PSNR</td>
<td>SSIM</td>
<td>LPIPS</td>
<td>PSNR</td>
<td>SSIM</td>
<td>LPIPS</td>
<td>PSNR</td>
<td>SSIM</td>
<td>LPIPS</td>
</tr>
<tr>
<td>3DGS</td>
<td>27.99</td>
<td>0.866</td>
<td>0.293</td>
<td>21.02</td>
<td>0.811</td>
<td>0.202</td>
<td>22.78</td>
<td>0.793</td>
<td>0.176</td>
</tr>
<tr>
<td>NSG [<xref ref-type="bibr" rid="ref-13">13</xref>]</td>
<td>24.08</td>
<td>0.656</td>
<td>0.441</td>
<td>26.66</td>
<td>0.876</td>
<td>0.185</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>MARS</td>
<td>28.83</td>
<td>0.681</td>
<td>0.430</td>
<td>27.96</td>
<td>0.900</td>
<td>0.185</td>
<td>23.09</td>
<td>0.857</td>
<td>0.174</td>
</tr>
<tr>
<td>Street Gaussians [<xref ref-type="bibr" rid="ref-42">42</xref>]</td>
<td>34.61</td>
<td>0.938</td>
<td>0.079</td>
<td>31.54</td>
<td>0.927</td>
<td>0.083</td>
<td>23.81</td>
<td>0.832</td>
<td>0.155</td>
</tr>
<tr>
<td>Hugs [<xref ref-type="bibr" rid="ref-43">43</xref>]</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>28.78</td>
<td>0.928</td>
<td>0.023</td>
<td>23.38</td>
<td>0.870</td>
<td>0.121</td>
</tr>
<tr>
<td>Evolsplat [<xref ref-type="bibr" rid="ref-44">44</xref>]</td>
<td>24.43</td>
<td>0.786</td>
<td>0.202</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>23.26</td>
<td>0.797</td>
<td>0.179</td>
</tr>
<tr>
<td>Vegs [<xref ref-type="bibr" rid="ref-45">45</xref>]</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>24.77</td>
<td>0.845</td>
<td>0.113</td>
<td>23.71</td>
<td>0.812</td>
<td>0.106</td>
</tr>
<tr>
<td>Ours</td>
<td><bold>34.92</bold></td>
<td><bold>0.940</bold></td>
<td>0.078</td>
<td><bold>31.76</bold></td>
<td><bold>0.929</bold></td>
<td><bold>0.080</bold></td>
<td><bold>23.83</bold></td>
<td><bold>0.834</bold></td>
<td><bold>0.153</bold></td>
</tr>
</tbody>
</table>
</table-wrap><fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>Qualitative results of reconstruction across different datasets. Visualization shows the effectiveness of our method in retaining structural details and reducing perceptual loss</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72544-fig-3.tif"/>
</fig>
<p>Our results, as shown in <xref ref-type="table" rid="table-5">Tables 5</xref> and <xref ref-type="table" rid="table-6">6</xref>, highlight the robustness of our method. Specifically, our method achieves a PSNR of 34.92 dB and SSIM of 0.940 on Waymo, surpassing Street Gaussians by 0.31 dB and 0.002, respectively. Additionally, the rendering speed of 135 FPS on the Waymo dataset is more than twice that of 3DGS (63 FPS), demonstrating the efficiency of our optimized Gaussian parameterization. These metrics validate the scalability and the real-time capability of our framework in dynamic urban scenarios. Furthermore, our experiments on the KITTI-360 dataset cover a continuous city-scale trajectory exceeding 80 km across Karlsruhe, corresponding to an urban area of over 5 <inline-formula id="ieqn-145"><mml:math id="mml-ieqn-145"><mml:msup><mml:mtext>km</mml:mtext><mml:mn>2</mml:mn></mml:msup></mml:math></inline-formula>, which is substantially larger than conventional KITTI odometry sequences (&#x003C;1 <inline-formula id="ieqn-146"><mml:math id="mml-ieqn-146"><mml:msup><mml:mtext>km</mml:mtext><mml:mn>2</mml:mn></mml:msup></mml:math></inline-formula>). This large-scale evaluation further demonstrates the effectiveness and scalability of our method in realistic urban environments.</p>

<p>Although Grounded-SAM is a general-purpose segmentation model, it performs reliably in structured urban scenes after adaptation. In our framework, it is prompted with traffic-related categories (vehicles, pedestrians, traffic signs, etc.) to focus on road-relevant objects. The grounding module supports text-guided detection, while the SAM backbone ensures accurate masks under illumination changes and partial occlusions. To improve stability, temporal filtering and geometric consistency checks between consecutive frames are applied to suppress spurious detections. Preliminary observations show that the model maintains stable segmentation quality across different viewpoint conditions, indicating its robustness and potential generalization to dynamic traffic environments. Representative qualitative detection and segmentation results are shown in <xref ref-type="fig" rid="fig-4">Fig. 4</xref>, where our approach produces tighter and more consistent masks than the baseline under challenging urban conditions.</p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>Qualitative results of object detection and segmentation of the reference model vs. our approach. Demonstrating precise bounding box generation and segmentation</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72544-fig-4.tif"/>
</fig>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Conclusion and Future Work</title>
<p>Our approach relies on synchronized radar and camera data, and its performance may degrade under adverse weather or poor sensor calibration. Real-time rendering currently requires high-end GPUs, which limits deployment on resource-constrained platforms. In addition, the effectiveness of Grounded-SAM depends on its pre-trained weights and prompt design, while radar data acquisition and calibration remain costly, posing challenges for large-scale deployment. Despite these limitations, experiments on the Waymo dataset&#x2014;covering diverse lighting, occlusion, and dynamic traffic&#x2014;demonstrate strong robustness and generalization to other urban datasets such as KITTI-360.</p>
<p>Future work will focus on three directions: improving 3D&#x2013;2D spatial consistency through hybrid loss functions and stronger multi-view alignment; integrating 3D Gaussians with lightweight implicit representations to reduce computational load; and extending the framework to larger-scale urban scenes and challenging sensing conditions such as rain, night, and sparse radar setups. These efforts aim to further enhance the scalability, efficiency, and robustness of SDG for real-world autonomous driving and smart city applications.</p>
</sec>
</body>
<back>
<ack>
<p>Not applicable.</p>
</ack>
<sec>
<title>Funding Statement</title>
<p>This research is supported by Inha University.</p>
</sec>
<sec>
<title>Author Contributions</title>
<p>Yu Du and Yan Li conceived the study and designed the overall framework; Yu Du implemented the proposed system, performed the experiments, and analyzed the data; Runwei Guan contributed to the algorithm design and experimental methodology; Ho-Pun Lam and Yutao Yue provided technical guidance and helped refine the model architecture; and Jeremy Smith and Ka Lok Man reviewed and edited the manuscript. All authors reviewed the results and approved the final version of the manuscript.</p>
</sec>
<sec sec-type="data-availability">
<title>Availability of Data and Materials</title>
<p>The data that support the findings of this study are available from the corresponding author upon reasonable request.</p>
</sec>
<sec>
<title>Ethics Approval</title>
<p>Not applicable.</p>
</sec>
<sec sec-type="COI-statement">
<title>Conflicts of Interest</title>
<p>The authors declare no conflicts of interest to report regarding the present study.</p>
</sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Mildenhall</surname> <given-names>B</given-names></string-name>, <string-name><surname>Srinivasan</surname> <given-names>PP</given-names></string-name>, <string-name><surname>Tancik</surname> <given-names>M</given-names></string-name>, <string-name><surname>Barron</surname> <given-names>JT</given-names></string-name>, <string-name><surname>Ramamoorthi</surname> <given-names>R</given-names></string-name>, <string-name><surname>Ng</surname> <given-names>R</given-names></string-name></person-group>. <article-title>NeRF: representing scenes as neural radiance fields for view synthesis</article-title>. <source>Commun ACM</source>. <year>2022</year>;<volume>65</volume>(<issue>1</issue>):<fpage>99</fpage>&#x2013;<lpage>106</lpage>.</mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Tancik</surname> <given-names>M</given-names></string-name>, <string-name><surname>Casser</surname> <given-names>V</given-names></string-name>, <string-name><surname>Yan</surname> <given-names>X</given-names></string-name>, <string-name><surname>Pradhan</surname> <given-names>S</given-names></string-name>, <string-name><surname>Mildenhall</surname> <given-names>B</given-names></string-name>, <string-name><surname>Srinivasan</surname> <given-names>PP</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Block-NeRF: scalable large scene neural view synthesis</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2022</year>. p. <fpage>8248</fpage>&#x2013;<lpage>58</lpage>.</mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Fei</surname> <given-names>B</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>J</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>R</given-names></string-name>, <string-name><surname>Zhou</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>W</given-names></string-name>, <string-name><surname>He</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>3D Gaussian splatting as a new era: a survey</article-title>. <source>IEEE Trans Visual Comput Graph</source>. <year>2025</year>;<volume>31</volume>(<issue>8</issue>):<fpage>4429</fpage>&#x2013;<lpage>49</lpage>. doi:<pub-id pub-id-type="doi">10.1109/tvcg.2024.3397828</pub-id>; <pub-id pub-id-type="pmid">38713572</pub-id></mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Kerbl</surname> <given-names>B</given-names></string-name>, <string-name><surname>Kopanas</surname> <given-names>G</given-names></string-name>, <string-name><surname>Leimk&#x00FC;hler</surname> <given-names>T</given-names></string-name>, <string-name><surname>Drettakis</surname> <given-names>G</given-names></string-name></person-group>. <article-title>3D Gaussian splatting for real-time radiance field rendering</article-title>. <source>ACM Trans Graph</source>. <year>2023</year>;<volume>42</volume>(<issue>4</issue>):<fpage>139</fpage>. doi:<pub-id pub-id-type="doi">10.1145/3592433</pub-id>.</mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Chen</surname> <given-names>L</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Ren</surname> <given-names>S</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>H</given-names></string-name>, <string-name><surname>Cai</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>Y</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Towards end-to-end embodied decision making via multi-modal large language model: explorations with GPT4-vision and beyond</article-title>. <comment>arXiv:2310.02071. 2023</comment>.</mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Birkl</surname> <given-names>R</given-names></string-name>, <string-name><surname>Wofk</surname> <given-names>D</given-names></string-name>, <string-name><surname>M&#x00FC;ller</surname> <given-names>M</given-names></string-name></person-group>. <article-title>MiDaS v3.1&#x2014;a model zoo for robust monocular relative depth estimation</article-title>. <comment>arXiv:2307.14460. 2023</comment>.</mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Sun</surname> <given-names>P</given-names></string-name>, <string-name><surname>Kretzschmar</surname> <given-names>H</given-names></string-name>, <string-name><surname>Dotiwalla</surname> <given-names>X</given-names></string-name>, <string-name><surname>Chouard</surname> <given-names>A</given-names></string-name>, <string-name><surname>Patnaik</surname> <given-names>V</given-names></string-name>, <string-name><surname>Tsui</surname> <given-names>P</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Scalability in perception for autonomous driving: waymo open dataset</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2020</year>. p. <fpage>2446</fpage>&#x2013;<lpage>54</lpage>.</mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Geiger</surname> <given-names>A</given-names></string-name>, <string-name><surname>Lenz</surname> <given-names>P</given-names></string-name>, <string-name><surname>Stiller</surname> <given-names>C</given-names></string-name>, <string-name><surname>Urtasun</surname> <given-names>R</given-names></string-name></person-group>. <article-title>Vision meets robotics: the KITTI dataset</article-title>. <source>Int J Robot Res</source>. <year>2013</year>;<volume>32</volume>(<issue>11</issue>):<fpage>1231</fpage>&#x2013;<lpage>7</lpage>. doi:<pub-id pub-id-type="doi">10.1177/0278364913491297</pub-id>.</mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Zhang</surname> <given-names>K</given-names></string-name>, <string-name><surname>Riegler</surname> <given-names>G</given-names></string-name>, <string-name><surname>Snavely</surname> <given-names>N</given-names></string-name>, <string-name><surname>Koltun</surname> <given-names>V</given-names></string-name></person-group>. <article-title>NeRF&#x002B;&#x002B;: analyzing and improving neural radiance fields</article-title>. <comment>arXiv:2010.07492. 2020</comment>.</mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Pumarola</surname> <given-names>A</given-names></string-name>, <string-name><surname>Corona</surname> <given-names>E</given-names></string-name>, <string-name><surname>Pons-Moll</surname> <given-names>G</given-names></string-name>, <string-name><surname>Moreno-Noguer</surname> <given-names>F</given-names></string-name></person-group>. <article-title>D-NeRF: neural radiance fields for dynamic scenes</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2021</year>. p. <fpage>10318</fpage>&#x2013;<lpage>27</lpage>.</mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Niklaus</surname> <given-names>S</given-names></string-name>, <string-name><surname>Snavely</surname> <given-names>N</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>O</given-names></string-name></person-group>. <article-title>Neural scene flow fields for space-time view synthesis of dynamic scenes</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2021</year>. p. <fpage>6498</fpage>&#x2013;<lpage>508</lpage>.</mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Gao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Su</surname> <given-names>L</given-names></string-name>, <string-name><surname>Liang</surname> <given-names>H</given-names></string-name>, <string-name><surname>Yue</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Fu</surname> <given-names>M</given-names></string-name></person-group>. <article-title>MC-NeRF: multi-camera neural radiance fields for multi-camera image acquisition systems</article-title>. <source>IEEE Trans Visual Comput Graph</source>. <year>2025</year>;<volume>31</volume>(<issue>10</issue>):<fpage>7391</fpage>&#x2013;<lpage>406</lpage>. doi:<pub-id pub-id-type="doi">10.1109/tvcg.2025.3546290</pub-id>; <pub-id pub-id-type="pmid">40031621</pub-id></mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Qi</surname> <given-names>CR</given-names></string-name>, <string-name><surname>Su</surname> <given-names>H</given-names></string-name>, <string-name><surname>Mo</surname> <given-names>K</given-names></string-name>, <string-name><surname>Guibas</surname> <given-names>LJ</given-names></string-name></person-group>. <article-title>PointNet: deep learning on point sets for 3D classification and segmentation</article-title>. In: <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2017</year>. p. <fpage>77</fpage>&#x2013;<lpage>85</lpage>.</mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Qi</surname> <given-names>CR</given-names></string-name>, <string-name><surname>Yi</surname> <given-names>L</given-names></string-name>, <string-name><surname>Su</surname> <given-names>H</given-names></string-name>, <string-name><surname>Guibas</surname> <given-names>LJ</given-names></string-name></person-group>. <chapter-title>PointNet&#x002B;&#x002B;: deep hierarchical feature learning on point sets in a metric space</chapter-title>. In: <source>Advances in neural information processing systems (NeurIPS)</source>. Vol. <volume>30</volume>. <publisher-loc>London, UK</publisher-loc>: <publisher-name>PMLR</publisher-name>; <year>2017</year>. p. <fpage>5099</fpage>&#x2013;<lpage>108</lpage>.</mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Wu</surname> <given-names>G</given-names></string-name>, <string-name><surname>Yi</surname> <given-names>T</given-names></string-name>, <string-name><surname>Fang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Xie</surname> <given-names>L</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Wei</surname> <given-names>W</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>4D Gaussian splatting for real-time dynamic scene rendering</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2024</year>. p. <fpage>20310</fpage>&#x2013;<lpage>20</lpage>.</mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Fridovich-Keil</surname> <given-names>S</given-names></string-name>, <string-name><surname>Meanti</surname> <given-names>G</given-names></string-name>, <string-name><surname>Warburg</surname> <given-names>FR</given-names></string-name>, <string-name><surname>Recht</surname> <given-names>B</given-names></string-name>, <string-name><surname>Kanazawa</surname> <given-names>A</given-names></string-name></person-group>. <article-title>K-Planes: explicit radiance fields in space, time, and appearance</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2023</year>. p. <fpage>12479</fpage>&#x2013;<lpage>88</lpage>.</mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Guo</surname> <given-names>J</given-names></string-name>, <string-name><surname>Deng</surname> <given-names>N</given-names></string-name>, <string-name><surname>Li</surname> <given-names>X</given-names></string-name>, <string-name><surname>Bai</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Shi</surname> <given-names>B</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>C</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>StreetSurf: extending multi-view implicit surface reconstruction to street views</article-title>. <comment>arXiv:2306.04988. 2023</comment>.</mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Kim</surname> <given-names>H</given-names></string-name>, <string-name><surname>Jung</surname> <given-names>M</given-names></string-name>, <string-name><surname>Noh</surname> <given-names>C</given-names></string-name>, <string-name><surname>Jung</surname> <given-names>S</given-names></string-name>, <string-name><surname>Song</surname> <given-names>H</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>W</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>HeRCULES: heterogeneous radar dataset in complex urban environment for multi-session radar SLAM</article-title>. In: <conf-name>2025 IEEE International Conference on Robotics and Automation (ICRA)</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2025</year>. p. <fpage>4649</fpage>&#x2013;<lpage>56</lpage>.</mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Jin</surname> <given-names>T</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Li</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>RCMixer: radar-camera fusion based on vision for robust object detection</article-title>. <source>J Vis Commun Image Rep</source>. <year>2024</year>;<volume>95</volume>:<fpage>103880</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.jvcir.2024.104367</pub-id>.</mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Xiao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>J</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Fu</surname> <given-names>M</given-names></string-name></person-group>. <article-title>Radar-camera fusion in perspective view and bird&#x2019;s eye view</article-title>. <source>Sensors</source>. <year>2025</year>;<volume>25</volume>(<issue>19</issue>):<fpage>6106</fpage>. doi:<pub-id pub-id-type="doi">10.3390/s25196106</pub-id>; <pub-id pub-id-type="pmid">41094932</pub-id></mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Pravallika</surname> <given-names>A</given-names></string-name>, <string-name><surname>Hashmi</surname> <given-names>MF</given-names></string-name>, <string-name><surname>Gupta</surname> <given-names>A</given-names></string-name></person-group>. <article-title>Deep learning frontiers in 3D object detection: a comprehensive review for autonomous driving</article-title>. <source>IEEE Access</source>. <year>2024</year>;<volume>12</volume>:<fpage>173936</fpage>&#x2013;<lpage>80</lpage>. doi:<pub-id pub-id-type="doi">10.1109/access.2024.3456893</pub-id>.</mixed-citation></ref>
<ref id="ref-22"><label>[22]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wei</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>F</given-names></string-name>, <string-name><surname>Chang</surname> <given-names>S</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>H</given-names></string-name>, <string-name><surname>Feng</surname> <given-names>Z</given-names></string-name></person-group>. <article-title>MmWave radar and vision fusion for object detection in autonomous driving: a review</article-title>. <source>Sensors</source>. <year>2022</year>;<volume>22</volume>(<issue>7</issue>):<fpage>2542</fpage>. doi:<pub-id pub-id-type="doi">10.3390/s22072542</pub-id>; <pub-id pub-id-type="pmid">35408157</pub-id></mixed-citation></ref>
<ref id="ref-23"><label>[23]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Wu</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>T</given-names></string-name>, <string-name><surname>Luo</surname> <given-names>L</given-names></string-name>, <string-name><surname>Zhong</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>J</given-names></string-name>, <string-name><surname>Xiao</surname> <given-names>H</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>MARS: an instance-aware, modular and realistic simulator for autonomous driving</article-title>. In: <conf-name>Artificial Intelligence: Third CAAI International Conference, CICAI 2023; 2023 July 22&#x2013;23</conf-name>; <publisher-loc>Fuzhou, China. Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>; <year>2024</year>. p. <fpage>3</fpage>&#x2013;<lpage>15</lpage>.</mixed-citation></ref>
<ref id="ref-24"><label>[24]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Straub</surname> <given-names>J</given-names></string-name>, <string-name><surname>Freifeld</surname> <given-names>O</given-names></string-name>, <string-name><surname>Rosman</surname> <given-names>G</given-names></string-name>, <string-name><surname>Leonard</surname> <given-names>JJ</given-names></string-name>, <string-name><surname>Fisher</surname> <given-names>JW</given-names></string-name></person-group>. <article-title>The Manhattan frame model&#x2014;Manhattan world inference in the space of surface normals</article-title>. <source>IEEE Trans Pattern Anal Mach Intell</source>. <year>2018</year>;<volume>40</volume>(<issue>1</issue>):<fpage>235</fpage>&#x2013;<lpage>49</lpage>. doi:<pub-id pub-id-type="doi">10.1109/tpami.2017.2662686</pub-id>; <pub-id pub-id-type="pmid">28166490</pub-id></mixed-citation></ref>
<ref id="ref-25"><label>[25]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Whelan</surname> <given-names>T</given-names></string-name>, <string-name><surname>Salas-Moreno</surname> <given-names>RF</given-names></string-name>, <string-name><surname>Glocker</surname> <given-names>B</given-names></string-name>, <string-name><surname>Davison</surname> <given-names>AJ</given-names></string-name>, <string-name><surname>Leutenegger</surname> <given-names>S</given-names></string-name></person-group>. <article-title>ElasticFusion: real-time dense SLAM and light source estimation</article-title>. <source>Int J Robot Res</source>. <year>2016</year>;<volume>35</volume>(<issue>14</issue>):<fpage>1697</fpage>&#x2013;<lpage>716</lpage>. doi:<pub-id pub-id-type="doi">10.1177/0278364916669237</pub-id>.</mixed-citation></ref>
<ref id="ref-26"><label>[26]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Kr&#x00E4;henb&#x00FC;hl</surname> <given-names>P</given-names></string-name>, <string-name><surname>Koltun</surname> <given-names>V</given-names></string-name></person-group>. <article-title>Efficient inference in fully connected CRFs with Gaussian edge potentials</article-title>. In: <conf-name>NIPS&#x2019;11: Proceedings of the 25th International Conference on Neural Information Processing Systems</conf-name>. <publisher-loc>Red Hook, NY, USA</publisher-loc>: <publisher-name>Curran Associates Inc.</publisher-name>; <year>2011</year>. p. <fpage>109</fpage>&#x2013;<lpage>17</lpage>.</mixed-citation></ref>
<ref id="ref-27"><label>[27]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Straub</surname> <given-names>J</given-names></string-name>, <string-name><surname>Bhandari</surname> <given-names>N</given-names></string-name>, <string-name><surname>Leonard</surname> <given-names>JJ</given-names></string-name>, <string-name><surname>Fisher</surname> <given-names>JW</given-names></string-name></person-group>. <article-title>Real-time Manhattan world rotation estimation in 3D</article-title>. In: <conf-name>Proceedings of the IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2015</year>. p. <fpage>1913</fpage>&#x2013;<lpage>20</lpage>.</mixed-citation></ref>
<ref id="ref-28"><label>[28]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Joo</surname> <given-names>K</given-names></string-name>, <string-name><surname>Oh</surname> <given-names>TH</given-names></string-name>, <string-name><surname>Kweon</surname> <given-names>IS</given-names></string-name>, <string-name><surname>Bazin</surname> <given-names>JC</given-names></string-name></person-group>. <article-title>Globally optimal inlier set maximization for Atlanta frame estimation</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2018</year>. p. <fpage>5726</fpage>&#x2013;<lpage>34</lpage>.</mixed-citation></ref>
<ref id="ref-29"><label>[29]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Sodhi</surname> <given-names>D</given-names></string-name>, <string-name><surname>Upadhyay</surname> <given-names>S</given-names></string-name>, <string-name><surname>Bhatt</surname> <given-names>D</given-names></string-name>, <string-name><surname>Krishna</surname> <given-names>KM</given-names></string-name>, <string-name><surname>Swarup</surname> <given-names>S</given-names></string-name></person-group>. <article-title>CRF based method for curb detection using semantic cues and stereo depth</article-title>. In: <conf-name>ICVGIP &#x2019;16: Proceedings of the Tenth Indian Conference on Computer Vision, Graphics and Image Processing</conf-name>. <publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>ACM</publisher-name>; <year>2016</year>. p. <fpage>1</fpage>&#x2013;<lpage>7</lpage>.</mixed-citation></ref>
<ref id="ref-30"><label>[30]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Arshad</surname> <given-names>S</given-names></string-name>, <string-name><surname>Sualeh</surname> <given-names>M</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>D</given-names></string-name>, <string-name><surname>Nam</surname> <given-names>DV</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>G</given-names></string-name></person-group>. <article-title>Clothoid: an integrated hierarchical framework for autonomous driving in a dynamic urban environment</article-title>. <source>Sensors</source>. <year>2020</year>;<volume>20</volume>(<issue>18</issue>):<fpage>5053</fpage>. doi:<pub-id pub-id-type="doi">10.3390/s20185053</pub-id>; <pub-id pub-id-type="pmid">32899543</pub-id></mixed-citation></ref>
<ref id="ref-31"><label>[31]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Cudrano</surname> <given-names>P</given-names></string-name>, <string-name><surname>Gallazzi</surname> <given-names>B</given-names></string-name>, <string-name><surname>Frosi</surname> <given-names>M</given-names></string-name>, <string-name><surname>Mentasti</surname> <given-names>S</given-names></string-name>, <string-name><surname>Matteucci</surname> <given-names>M</given-names></string-name></person-group>. <article-title>Clothoid-based lane-level high-definition maps: unifying sensing and control models</article-title>. <source>IEEE Veh Technol Mag</source>. <year>2022</year>;<volume>17</volume>(<issue>4</issue>):<fpage>47</fpage>&#x2013;<lpage>56</lpage>. doi:<pub-id pub-id-type="doi">10.1109/mvt.2022.3209503</pub-id>.</mixed-citation></ref>
<ref id="ref-32"><label>[32]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Xie</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Gadelha</surname> <given-names>M</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>F</given-names></string-name>, <string-name><surname>Zhou</surname> <given-names>X</given-names></string-name>, <string-name><surname>Jiang</surname> <given-names>H</given-names></string-name></person-group>. <article-title>PlanarRecon: real-time 3D plane detection and reconstruction from posed monocular videos</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2022</year>. p. <fpage>6219</fpage>&#x2013;<lpage>28</lpage>.</mixed-citation></ref>
<ref id="ref-33"><label>[33]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>C</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>K</given-names></string-name>, <string-name><surname>Gu</surname> <given-names>J</given-names></string-name>, <string-name><surname>Furukawa</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Kautz</surname> <given-names>J</given-names></string-name></person-group>. <article-title>PlaneRCNN: 3D plane detection and reconstruction from a single image</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2019</year>. p. <fpage>4450</fpage>&#x2013;<lpage>9</lpage>.</mixed-citation></ref>
<ref id="ref-34"><label>[34]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Kirillov</surname> <given-names>A</given-names></string-name>, <string-name><surname>Mintun</surname> <given-names>E</given-names></string-name>, <string-name><surname>Ravi</surname> <given-names>N</given-names></string-name>, <string-name><surname>Mao</surname> <given-names>H</given-names></string-name>, <string-name><surname>Rolland</surname> <given-names>C</given-names></string-name>, <string-name><surname>Gustafson</surname> <given-names>L</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Segment anything</article-title>. In: <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2023</year>. p. <fpage>3992</fpage>&#x2013;<lpage>4003</lpage>.</mixed-citation></ref>
<ref id="ref-35"><label>[35]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Ren</surname> <given-names>T</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>S</given-names></string-name>, <string-name><surname>Zeng</surname> <given-names>A</given-names></string-name>, <string-name><surname>Lin</surname> <given-names>J</given-names></string-name>, <string-name><surname>Li</surname> <given-names>K</given-names></string-name>, <string-name><surname>Cao</surname> <given-names>H</given-names></string-name>, <etal>et al.</etal></person-group>. <article-title>Grounded SAM: assembling open-world models for diverse visual tasks</article-title>. <comment>arXiv:2401.14159. 2024</comment>.</mixed-citation></ref>
<ref id="ref-36"><label>[36]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Yao</surname> <given-names>J</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>T</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>X</given-names></string-name></person-group>. <article-title>Improving depth gradient continuity in transformers: a comparative study on monocular depth estimation with CNN</article-title>. In: <conf-name>Proceedings of the 35th British Machine Vision Conference (BMVC)</conf-name>; <year>2024 Nov 25&#x2013;28</year>; <publisher-loc>Glasgow, UK</publisher-loc>. p. <fpage>1</fpage>&#x2013;<lpage>13</lpage>.</mixed-citation></ref>
<ref id="ref-37"><label>[37]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Versaci</surname> <given-names>M</given-names></string-name>, <string-name><surname>Morabito</surname> <given-names>FC</given-names></string-name></person-group>. <article-title>Image edge detection: a new approach based on fuzzy entropy and fuzzy divergence</article-title>. <source>Int J Fuzzy Syst</source>. <year>2021</year>;<volume>23</volume>(<issue>4</issue>):<fpage>918</fpage>&#x2013;<lpage>36</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s40815-020-01030-5</pub-id>.</mixed-citation></ref>
<ref id="ref-38"><label>[38]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Bhat</surname> <given-names>SF</given-names></string-name>, <string-name><surname>Birkl</surname> <given-names>R</given-names></string-name>, <string-name><surname>Wofk</surname> <given-names>D</given-names></string-name>, <string-name><surname>Wonka</surname> <given-names>P</given-names></string-name>, <string-name><surname>M&#x00FC;ller</surname> <given-names>M</given-names></string-name></person-group>. <article-title>ZoeDepth: zero-shot transfer by combining relative and metric depth</article-title>. <comment>arXiv:2302.12288. 2023</comment>.</mixed-citation></ref>
<ref id="ref-39"><label>[39]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Annaby</surname> <given-names>MH</given-names></string-name>, <string-name><surname>Al-Abdi</surname> <given-names>IA</given-names></string-name></person-group>. <article-title>A Gaussian regularization for derivative sampling interpolation of signals in the linear canonical transform representations</article-title>. <source>Signal Image Video Process</source>. <year>2023</year>;<volume>17</volume>:<fpage>2157</fpage>&#x2013;<lpage>65</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s11760-022-02430-w</pub-id>.</mixed-citation></ref>
<ref id="ref-40"><label>[40]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Nilsson</surname> <given-names>J</given-names></string-name>, <string-name><surname>Akenine-M&#x00F6;ller</surname> <given-names>T</given-names></string-name></person-group>. <article-title>Understanding SSIM</article-title>. <comment>arXiv:2006.13846. 2020</comment>.</mixed-citation></ref>
<ref id="ref-41"><label>[41]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Kaur</surname> <given-names>R</given-names></string-name>, <string-name><surname>Kumar</surname> <given-names>R</given-names></string-name>, <string-name><surname>Gupta</surname> <given-names>M</given-names></string-name></person-group>. <article-title>Review on transfer learning for convolutional neural network</article-title>. In: <conf-name>Proceedings of the 2021 3rd International Conference on Advances in Computing, Communication Control and Networking (ICAC3N)</conf-name>; <year>2021 Dec 17&#x2013;18</year>; <publisher-loc>Greater Noida, India</publisher-loc>. p. <fpage>922</fpage>&#x2013;<lpage>6</lpage>.</mixed-citation></ref>
<ref id="ref-42"><label>[42]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Yan</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Lin</surname> <given-names>H</given-names></string-name>, <string-name><surname>Zhou</surname> <given-names>C</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>W</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>H</given-names></string-name>, <string-name><surname>Zhan</surname> <given-names>K</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Street Gaussians: modeling dynamic urban scenes with Gaussian splatting</article-title>. In: <conf-name>Computer Vision&#x2014;ECCV 2024: 18th European Conference</conf-name>. <publisher-loc>Berlin/Heidelberg, Germany</publisher-loc>: <publisher-name>Springer-Verlag</publisher-name>; <year>2024</year>. p. <fpage>156</fpage>&#x2013;<lpage>73</lpage>.</mixed-citation></ref>
<ref id="ref-43"><label>[43]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Zhou</surname> <given-names>H</given-names></string-name>, <string-name><surname>Shao</surname> <given-names>J</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>L</given-names></string-name>, <string-name><surname>Bai</surname> <given-names>D</given-names></string-name>, <string-name><surname>Qiu</surname> <given-names>W</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>B</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>HUGS: holistic urban 3D scene understanding via Gaussian splatting</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2024</year>. p. <fpage>21336</fpage>&#x2013;<lpage>45</lpage>.</mixed-citation></ref>
<ref id="ref-44"><label>[44]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Miao</surname> <given-names>S</given-names></string-name>, <string-name><surname>Huang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Bai</surname> <given-names>D</given-names></string-name>, <string-name><surname>Yan</surname> <given-names>X</given-names></string-name>, <string-name><surname>Zhou</surname> <given-names>H</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>Y</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>EVolSplat: efficient volume-based Gaussian splatting for urban view synthesis</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2025</year>. p. <fpage>11286</fpage>&#x2013;<lpage>96</lpage>.</mixed-citation></ref>
<ref id="ref-45"><label>[45]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Hwang</surname> <given-names>S</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>MJ</given-names></string-name>, <string-name><surname>Kang</surname> <given-names>T</given-names></string-name>, <string-name><surname>Choo</surname> <given-names>J</given-names></string-name></person-group>. <chapter-title>VEGS: view extrapolation of urban scenes in 3D Gaussian splatting using learned priors</chapter-title>. In: <source>Computer Vision&#x2013;ECCV 2024</source>. <publisher-loc>Berlin/Heidelberg, Germany</publisher-loc>: <publisher-name>Springer-Verlag</publisher-name>; <year>2025</year>. p. <fpage>1</fpage>&#x2013;<lpage>18</lpage>. doi:<pub-id pub-id-type="doi">10.1007/978-3-031-73001-6_1</pub-id>.</mixed-citation></ref>
</ref-list>
</back></article>