{"id":1082,"date":"2026-03-23T06:30:19","date_gmt":"2026-03-23T06:30:19","guid":{"rendered":"https:\/\/ouyangminwei.com\/?p=1082"},"modified":"2026-03-23T06:30:20","modified_gmt":"2026-03-23T06:30:20","slug":"kimi-attnres","status":"publish","type":"post","link":"https:\/\/ouyangminwei.com\/index.php\/2026\/03\/23\/kimi-attnres\/","title":{"rendered":"Kimi Attnres"},"content":{"rendered":"\n<!DOCTYPE html>\n<html lang=\"zh-TW\" class=\"scroll-smooth\">\n<head>\n    <meta charset=\"UTF-8\">\n    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n    <title>\u6df1\u5165\u6dfa\u51fa Kimi AttnRes\uff1a\u6ce8\u610f\u529b\u6b98\u5dee\u6a5f\u5236\u7684\u6578\u5b78\u8207\u76f4\u89ba<\/title>\n    \n    <!-- Tailwind CSS -->\n    <script src=\"https:\/\/cdn.tailwindcss.com\"><\/script>\n    \n    <!-- Chart.js -->\n    <script src=\"https:\/\/cdn.jsdelivr.net\/npm\/chart.js\"><\/script>\n\n    <!-- MathJax for LaTeX rendering -->\n    <script>\n        MathJax = {\n            tex: {\n                inlineMath: [['$', '$']],\n                displayMath: [['$$', '$$']]\n            },\n            svg: {\n                fontCache: 'global'\n            }\n        };\n    <\/script>\n    <script type=\"text\/javascript\" id=\"MathJax-script\" async\n      src=\"https:\/\/cdn.jsdelivr.net\/npm\/mathjax@3\/es5\/tex-mml-chtml.js\">\n    <\/script>\n\n    <script>\n        tailwind.config = {\n            theme: {\n                extend: {\n                    colors: {\n                        stone: {\n                            50: '#fafaf9',\n                            100: '#f5f5f4',\n                            200: '#e7e5e4',\n                            300: '#d6d3d1',\n                            700: '#44403c',\n                            800: '#292524',\n                            900: '#1c1917',\n                        },\n                        teal: {\n                            600: '#0d9488',\n                            700: 
'#0f766e',\n                        }\n                    },\n                    fontFamily: {\n                        sans: ['Inter', 'system-ui', 'sans-serif'],\n                        serif: ['Merriweather', 'serif'],\n                        mono: ['Fira Code', 'monospace'],\n                    }\n                }\n            }\n        }\n    <\/script>\n\n    <style>\n        \/* CSS required by instructions for chart containers *\/\n        .chart-container {\n            position: relative;\n            width: 100%;\n            max-width: 800px;\n            margin-left: auto;\n            margin-right: auto;\n            height: 350px;\n            max-height: 450px;\n        }\n        @media (min-width: 768px) {\n            .chart-container {\n                height: 400px;\n            }\n        }\n\n        \/* Custom scrollbar for a polished look *\/\n        ::-webkit-scrollbar {\n            width: 8px;\n        }\n        ::-webkit-scrollbar-track {\n            background: #f5f5f4; \n        }\n        ::-webkit-scrollbar-thumb {\n            background: #d6d3d1; \n            border-radius: 4px;\n        }\n        ::-webkit-scrollbar-thumb:hover {\n            background: #a8a29e; \n        }\n\n        \/* Math container transition *\/\n        .math-step-content {\n            display: none;\n            opacity: 0;\n            transition: opacity 0.3s ease-in-out;\n        }\n        .math-step-content.active {\n            display: block;\n            opacity: 1;\n        }\n        \n        .nav-link.active {\n            color: #0d9488;\n            border-bottom: 2px solid #0d9488;\n        }\n    <\/style>\n<\/head>\n<body class=\"bg-stone-50 text-stone-800 font-sans antialiased leading-relaxed tracking-wide\">\n\n    <!-- Chosen Palette: Warm Neutrals (Stone) with Teal accents for a calm, academic, yet modern feel. -->\n    \n    <!-- Application Structure Plan: \n         1. 
Header\/Nav: Sticky navigation for easy movement between complex topics.\n         2. Hero\/Intro: Establishes context for users who already know Attention.\n         3. The Bottleneck (Standard Attention): Explains the 'why' before the 'what'. Uses simple layout to contrast deep layer degradation.\n         4. Core Concept (AttnRes Architecture): Uses structural HTML\/CSS boxes to show the bypass mechanism without SVG. Interactive toggle to compare.\n         5. The Math Decoded: The core requirement. A step-by-step interactive math explainer. Prevents cognitive overload by hiding\/showing equations.\n         6. Metrics\/Impact: Chart.js visualizations showing the empirical benefits (loss convergence, stability). \n         Why this structure? It follows a logical pedagogical flow: Context -> Problem -> Intuitive Solution -> Mathematical Rigor -> Empirical Proof. -->\n\n    <!-- Visualization & Content Choices: \n         - Architecture Diagram: Goal - Compare standard vs AttnRes. Method - HTML\/CSS flexbox grid with Unicode arrows. Justification - Meets NO SVG constraint while clearly showing the residual connection pathway.\n         - Math Explainer: Goal - Teach the derivation. Method - Interactive tabs with MathJax. Justification - Math equations in deep learning are overwhelming all at once. Step-by-step interaction aids understanding.\n         - Training Loss Chart: Goal - Show performance gain. Method - Chart.js Line Chart. Justification - Standard way to show convergence.\n         - Gradient Norm Chart: Goal - Prove the mathematical claim of reduced gradient vanishing. Method - Chart.js Bar Chart. Justification - Clear comparison of layer-wise gradient scales. \n         CONFIRMATION: NO SVG graphics used. NO Mermaid JS used. 
-->\n\n    <!-- Navigation -->\n    <nav class=\"sticky top-0 z-50 bg-stone-50\/90 backdrop-blur-md border-b border-stone-200\">\n        <div class=\"max-w-6xl mx-auto px-4 sm:px-6 lg:px-8\">\n            <div class=\"flex justify-between h-16\">\n                <div class=\"flex items-center\">\n                    <span class=\"text-xl font-bold text-stone-900 tracking-tight\">Kimi <span class=\"text-teal-600\">AttnRes<\/span><\/span>\n                <\/div>\n                <div class=\"hidden md:flex space-x-8 items-center\">\n                    <a href=\"#intro\" class=\"nav-link active text-sm font-medium text-stone-600 hover:text-teal-600 transition-colors py-2\">\u7c21\u4ecb\u8207\u75db\u9ede<\/a>\n                    <a href=\"#architecture\" class=\"nav-link text-sm font-medium text-stone-600 hover:text-teal-600 transition-colors py-2\">\u67b6\u69cb\u76f4\u89ba<\/a>\n                    <a href=\"#math-deep-dive\" class=\"nav-link text-sm font-medium text-stone-600 hover:text-teal-600 transition-colors py-2\">\u6578\u5b78\u63a8\u5c0e<\/a>\n                    <a href=\"#metrics\" class=\"nav-link text-sm font-medium text-stone-600 hover:text-teal-600 transition-colors py-2\">\u6548\u80fd\u5be6\u8b49<\/a>\n                <\/div>\n            <\/div>\n        <\/div>\n    <\/nav>\n\n    <main class=\"max-w-5xl mx-auto px-4 sm:px-6 lg:px-8 py-12 space-y-24\">\n\n        <!-- Section: Intro & Problem -->\n        <section id=\"intro\" class=\"scroll-mt-24\">\n            <header class=\"mb-12 text-center\">\n                <h1 class=\"text-4xl md:text-5xl font-extrabold text-stone-900 mb-6 leading-tight\">\n                    \u6253\u7834\u6df1\u5c64\u7db2\u8def\u7684\u6ce8\u610f\u529b\u74f6\u9838\uff1a<br>\n                    <span class=\"text-teal-700\">AttnRes \u6b98\u5dee\u6a5f\u5236\u89e3\u6790<\/span>\n                <\/h1>\n                <p class=\"text-lg text-stone-600 max-w-3xl mx-auto\">\n                    
\u5982\u679c\u4f60\u5df2\u7d93\u719f\u6089 Transformer \u7684 QKV \u6a5f\u5236\uff0c\u4f60\u4e00\u5b9a\u77e5\u9053\u5728\u6975\u6df1\u5c64\u7684\u7db2\u8def\u4e2d\uff0cAttention \u77e9\u9663\u5bb9\u6613\u8da8\u65bc\u5e73\u6ed1\uff08Attention Collapse\uff09\uff0c\u5c0e\u81f4\u68af\u5ea6\u6d88\u5931\u3002Kimi \u5718\u968a\uff08\u53ca\u8fd1\u4ee3\u5927\u6a21\u578b\u7814\u7a76\uff09\u63d0\u51fa\u7684 <strong>AttnRes (Attention Residuals)<\/strong> \u6b63\u662f\u70ba\u4e86\u89e3\u6c7a\u9019\u500b\u6578\u5b78\u75db\u9ede\u3002\n                <\/p>\n            <\/header>\n\n            <div class=\"bg-white rounded-2xl shadow-sm border border-stone-200 p-8\">\n                <h2 class=\"text-2xl font-bold text-stone-800 mb-4 border-l-4 border-teal-600 pl-4\">\u50b3\u7d71 Attention \u7684\u81f4\u547d\u50b7\uff1aSoftmax \u7684\u8caa\u5a6a<\/h2>\n                <p class=\"text-stone-700 mb-6\">\n                    \u5728\u6a19\u6e96\u67b6\u69cb\u4e2d\uff0c\u7b2c $l$ \u5c64\u7684\u6ce8\u610f\u529b\u5206\u6578\u8a08\u7b97\u70ba\uff1a\n                <\/p>\n                <div class=\"bg-stone-100 p-4 rounded-lg overflow-x-auto mb-6 text-center font-mono text-lg\">\n                    $$A^{(l)} = \\text{Softmax}\\left(\\frac{Q^{(l)} (K^{(l)})^T}{\\sqrt{d_k}}\\right)$$\n                <\/div>\n                <div class=\"grid md:grid-cols-2 gap-8 mt-8\">\n                    <div>\n                        <h3 class=\"font-semibold text-lg text-stone-800 mb-2\">1. 
\u68af\u5ea6\u6d88\u5931 (Gradient Vanishing)<\/h3>\n                        <p class=\"text-sm text-stone-600\">\n                            Softmax \u51fd\u6578\u5728\u8f38\u5165\u503c\u5dee\u7570\u8f03\u5927\u6642\uff0c\u8f38\u51fa\u6703\u6975\u5ea6\u8da8\u8fd1\u65bc 0 \u6216 1\uff08One-hot \u5206\u4f48\uff09\u3002\u9019\u5c0e\u81f4\u5728\u53cd\u5411\u50b3\u64ad\u6642\uff0cSoftmax \u7684 Jacobian \u77e9\u9663\u8da8\u8fd1\u65bc\u96f6\u77e9\u9663\uff0c\u68af\u5ea6\u7121\u6cd5\u6709\u6548\u50b3\u905e\u5230\u6dfa\u5c64\u7684 Q \u548c K \u6b0a\u91cd\u3002\n                        <\/p>\n                    <\/div>\n                    <div>\n                        <h3 class=\"font-semibold text-lg text-stone-800 mb-2\">2. \u6ce8\u610f\u529b\u5d29\u584c (Attention Collapse)<\/h3>\n                        <p class=\"text-sm text-stone-600\">\n                            \u96a8\u8457\u5c64\u6578\u589e\u52a0\uff08\u4f8b\u5982 > 64 \u5c64\uff09\uff0c\u4e0d\u540c\u5c64\u7684 Attention \u77e9\u9663\u6703\u9010\u6f38\u540c\u8cea\u5316\uff0c\u6240\u6709 Token \u7684\u6ce8\u610f\u529b\u5206\u4f48\u8b8a\u5f97\u4e00\u6a21\u4e00\u6a23\uff0c\u5931\u53bb\u4e86\u6355\u6349\u591a\u6a23\u5316\u7279\u5fb5\u7684\u80fd\u529b\uff0c\u5f62\u540c\u7db2\u8def\u9000\u5316\u3002\n                        <\/p>\n                    <\/div>\n                <\/div>\n            <\/div>\n        <\/section>\n\n        <!-- Section: Architecture \/ Core Concept -->\n        <section id=\"architecture\" class=\"scroll-mt-24\">\n            <div class=\"mb-8\">\n                <h2 class=\"text-3xl font-bold text-stone-900 mb-4\">\u67b6\u69cb\u76f4\u89ba\uff1a\u8b93 Attention \u77e9\u9663\u300c\u8a18\u4f4f\u300d\u904e\u53bb<\/h2>\n                <p class=\"text-stone-700\">\n                    \u672c\u5340\u584a\u6bd4\u8f03\u50b3\u7d71 Attention \u8207 AttnRes \u7684\u67b6\u69cb\u5dee\u7570\u3002\u8207\u5176\u8b93\u6bcf\u4e00\u5c64\u5f9e\u982d\u8a08\u7b97 Q \u548c K 
\u7684\u76f8\u4f3c\u5ea6\uff0c<strong>AttnRes \u5c07\u4e0a\u4e00\u5c64\u7684\u6ce8\u610f\u529b\u5206\u6578\uff08\u6216 Logits\uff09\u4f5c\u70ba\u6b98\u5dee\uff08Residual\uff09\u76f4\u63a5\u52a0\u5230\u7576\u524d\u5c64\u4e2d\u3002<\/strong> \u9019\u6a23\u4e0d\u50c5\u4fdd\u7559\u4e86\u524d\u4e00\u5c64\u7684\u7279\u5fb5\uff0c\u66f4\u70ba\u68af\u5ea6\u63d0\u4f9b\u4e86\u4e00\u689d\u300c\u9ad8\u901f\u516c\u8def\u300d\u3002\n                <\/p>\n            <\/div>\n\n            <!-- Interactive Architecture Diagram (HTML\/CSS Based) -->\n            <div class=\"bg-stone-900 rounded-2xl p-6 md:p-10 shadow-lg text-stone-200 relative\">\n                \n                <div class=\"flex justify-center mb-8\">\n                    <div class=\"bg-stone-800 p-1 rounded-lg inline-flex\">\n                        <button id=\"btn-std-arch\" class=\"px-6 py-2 rounded-md font-medium text-sm transition-colors bg-teal-600 text-white shadow\">\u50b3\u7d71 Attention<\/button>\n                        <button id=\"btn-res-arch\" class=\"px-6 py-2 rounded-md font-medium text-sm transition-colors text-stone-400 hover:text-white\">AttnRes \u6a5f\u5236<\/button>\n                    <\/div>\n                <\/div>\n\n                <!-- Standard Architecture Block -->\n                <div id=\"arch-std\" class=\"flex flex-col items-center space-y-4 animate-fade-in\">\n                    <div class=\"px-6 py-3 bg-stone-800 border border-stone-700 rounded-lg text-center w-64 shadow-md\">\n                        <span class=\"font-mono text-sm text-stone-400\">Layer $(l-1)$<\/span><br>\n                        <span class=\"font-bold\">\u96b1\u85cf\u72c0\u614b $X^{(l-1)}$<\/span>\n                    <\/div>\n                    <div class=\"text-2xl text-stone-600\">\u2b07<\/div>\n                    <div class=\"flex space-x-4\">\n                        <div class=\"px-4 py-2 bg-blue-900\/30 border border-blue-800 text-blue-300 rounded text-sm w-20 
text-center\">$Q^{(l)}$<\/div>\n                        <div class=\"px-4 py-2 bg-blue-900\/30 border border-blue-800 text-blue-300 rounded text-sm w-20 text-center\">$K^{(l)}$<\/div>\n                    <\/div>\n                    <div class=\"text-2xl text-stone-600\">\u2b07<\/div>\n                    <div class=\"px-6 py-3 bg-stone-800 border border-stone-700 rounded-lg text-center w-64 shadow-md\">\n                        <span class=\"font-mono text-sm text-stone-400\">\u77e9\u9663\u4e58\u6cd5 &#038; \u7e2e\u653e<\/span><br>\n                        <span class=\"font-bold\">$\\frac{Q \\cdot K^T}{\\sqrt{d}}$<\/span>\n                    <\/div>\n                    <div class=\"text-2xl text-stone-600\">\u2b07<\/div>\n                    <div class=\"px-6 py-3 bg-purple-900\/40 border border-purple-800 text-purple-200 rounded-lg text-center w-64 shadow-md font-bold\">\n                        Softmax $(A^{(l)})$\n                    <\/div>\n                    <div class=\"text-2xl text-stone-600\">\u2b07<\/div>\n                    <div class=\"px-6 py-3 bg-stone-800 border border-stone-700 rounded-lg text-center w-64 shadow-md\">\n                        <span class=\"font-bold\">\u8f38\u51fa\u81f3\u4e0b\u5c64 $X^{(l)}$<\/span>\n                    <\/div>\n                <\/div>\n\n                <!-- AttnRes Architecture Block -->\n                <div id=\"arch-res\" class=\"hidden flex flex-col items-center space-y-4 animate-fade-in\">\n                    <div class=\"flex justify-center space-x-12 w-full max-w-2xl\">\n                        <!-- Left Path -->\n                        <div class=\"flex flex-col items-center space-y-4 w-1\/2 relative\">\n                            <div class=\"px-6 py-3 bg-stone-800 border border-stone-700 rounded-lg text-center w-48 shadow-md\">\n                                <span class=\"font-mono text-sm text-stone-400\">Layer $(l-1)$<\/span><br>\n                                <span class=\"font-bold 
text-sm\">\u72c0\u614b $X^{(l-1)}$<\/span>\n                            <\/div>\n                            <div class=\"text-xl text-stone-600\">\u2b07<\/div>\n                            <div class=\"flex justify-center space-x-2 w-full\">\n                                <div class=\"px-2 py-2 bg-blue-900\/30 border border-blue-800 text-blue-300 rounded text-xs w-16 text-center\">$Q^{(l)}$<\/div>\n                                <div class=\"px-2 py-2 bg-blue-900\/30 border border-blue-800 text-blue-300 rounded text-xs w-16 text-center\">$K^{(l)}$<\/div>\n                            <\/div>\n                            <div class=\"text-xl text-stone-600\">\u2b07<\/div>\n                            <div class=\"px-4 py-2 bg-stone-800 border border-stone-700 rounded-lg text-center w-48 shadow-md\">\n                                <span class=\"font-bold text-sm\">\u65b0\u7279\u5fb5 $\\frac{Q \\cdot K^T}{\\sqrt{d}}$<\/span>\n                            <\/div>\n                            <div class=\"text-xl text-stone-600\">\u2b07<\/div>\n                        <\/div>\n\n                        <!-- Right Path (Residual) -->\n                        <div class=\"flex flex-col items-center space-y-4 w-1\/2\">\n                            <div class=\"px-6 py-3 bg-teal-900\/40 border border-teal-700 rounded-lg text-center w-48 shadow-md text-teal-200\">\n                                <span class=\"font-mono text-xs text-teal-400\">\u4f86\u81ea\u4e0a\u4e00\u5c64\u7684 Logits<\/span><br>\n                                <span class=\"font-bold text-sm\">$M^{(l-1)}$<\/span>\n                            <\/div>\n                            <!-- Arrow bridging down -->\n                            <div class=\"h-24 border-r-2 border-dashed border-teal-600 relative\">\n                                <div class=\"absolute -bottom-2 -left-3 text-teal-600 text-xl\">\u2b07<\/div>\n                            <\/div>\n                        <\/div>\n                  
  <\/div>\n\n                    <!-- Merge Point -->\n                    <div class=\"flex justify-center items-center w-64 h-12 border-2 border-teal-500 rounded-full bg-teal-900\/20 text-teal-300 font-bold mb-4 relative z-10 shadow-[0_0_15px_rgba(13,148,136,0.3)]\">\n                        + (\u77e9\u9663\u76f8\u52a0)\n                    <\/div>\n                    \n                    <div class=\"px-6 py-3 bg-purple-900\/40 border border-purple-800 text-purple-200 rounded-lg text-center w-64 shadow-md font-bold\">\n                        Softmax $(A^{(l)})$\n                    <\/div>\n                    <div class=\"text-2xl text-stone-600\">\u2b07<\/div>\n                    <div class=\"px-6 py-3 bg-stone-800 border border-stone-700 rounded-lg text-center w-64 shadow-md\">\n                        <span class=\"font-bold\">\u8f38\u51fa\u81f3\u4e0b\u5c64 $X^{(l)}$<\/span>\n                    <\/div>\n                <\/div>\n\n                <div class=\"mt-8 text-center text-sm text-stone-400 bg-stone-800\/50 rounded p-4\">\n                    <span class=\"text-teal-400 font-bold\">\u6838\u5fc3\u5dee\u7570\uff1a<\/span> \u50b3\u7d71\u6b98\u5dee\u52a0\u5728 $X$ (Feed Forward \u8f38\u51fa)\uff0c\u800c <strong>AttnRes \u5c07\u6b98\u5dee\u6a5f\u5236\u76f4\u63a5\u505a\u9032 Attention \u7684 Logits (Softmax \u4e4b\u524d) \u88e1\u9762<\/strong>\u3002\n                <\/div>\n            <\/div>\n        <\/section>\n\n        <!-- Section: Math Deep Dive -->\n        <section id=\"math-deep-dive\" class=\"scroll-mt-24\">\n            <div class=\"mb-8 border-b border-stone-200 pb-4\">\n                <h2 class=\"text-3xl font-bold text-stone-900 mb-2\">\u6578\u5b78\u63a8\u5c0e\uff1a\u70ba\u4ec0\u9ebc\u5b83\u80fd\u6253\u901a\u68af\u5ea6\uff1f<\/h2>\n                <p class=\"text-stone-600\">\n                    \u672c\u5340\u584a\u5c07\u4ee5\u4e92\u52d5\u65b9\u5f0f\u62c6\u89e3 AttnRes 
\u7684\u6578\u5b78\u516c\u5f0f\u3002\u9ede\u64ca\u4e0b\u65b9\u6b65\u9a5f\uff0c\u6df1\u5165\u4e86\u89e3 Attention Logits \u7684\u6b98\u5dee\u5982\u4f55\u6539\u8b8a\u53cd\u5411\u50b3\u64ad\u7684\u547d\u904b\u3002\n                <\/p>\n            <\/div>\n\n            <div class=\"flex flex-col md:flex-row gap-8\">\n                <!-- Navigation for Math Steps -->\n                <div class=\"md:w-1\/3 flex flex-col space-y-2 border-l-2 border-stone-200 pl-4\">\n                    <button class=\"math-nav-btn text-left px-4 py-3 rounded-r-lg font-bold transition-all text-teal-700 bg-teal-50 border-l-4 -ml-4 border-teal-600\" data-target=\"math-step-1\">\n                        Step 1: \u5b9a\u7fa9\u516c\u5f0f\n                    <\/button>\n                    <button class=\"math-nav-btn text-left px-4 py-3 rounded-r-lg font-medium text-stone-500 hover:text-stone-800 hover:bg-stone-100 transition-all border-l-4 -ml-4 border-transparent\" data-target=\"math-step-2\">\n                        Step 2: \u504f\u5fae\u5206\u8207\u93c8\u9396\u5f8b\n                    <\/button>\n                    <button class=\"math-nav-btn text-left px-4 py-3 rounded-r-lg font-medium text-stone-500 hover:text-stone-800 hover:bg-stone-100 transition-all border-l-4 -ml-4 border-transparent\" data-target=\"math-step-3\">\n                        Step 3: \u62ef\u6551\u68af\u5ea6\u6d88\u5931\n                    <\/button>\n                <\/div>\n\n                <!-- Math Content Area -->\n                <div class=\"md:w-2\/3 bg-white p-6 md:p-8 rounded-2xl shadow-sm border border-stone-200 min-h-[400px]\">\n                    \n                    <!-- Step 1 -->\n                    <div id=\"math-step-1\" class=\"math-step-content active\">\n                        <h3 class=\"text-xl font-bold text-stone-800 mb-4\">\u91cd\u65b0\u5b9a\u7fa9 Attention Logits<\/h3>\n                        <p class=\"text-stone-600 mb-4\">\n                            \u4ee4 $M^{(l)}$ 
\u70ba\u7b2c $l$ \u5c64\u7684 Attention Logits\uff08\u5373 Softmax \u524d\u7684\u77e9\u9663\uff09\u3002\u5728\u6a19\u6e96 Transformer \u4e2d\uff1a\n                        <\/p>\n                        <div class=\"overflow-x-auto text-center py-2 mb-4 font-mono text-stone-700\">\n                            $$M^{(l)}_{std} = \\frac{Q^{(l)} (K^{(l)})^T}{\\sqrt{d_k}}$$\n                        <\/div>\n                        <p class=\"text-stone-600 mb-4\">\n                            \u5f15\u5165 <strong>AttnRes<\/strong> \u5f8c\uff0c\u6211\u5011\u5c07\u4e0a\u4e00\u5c64\u7684 Logits $M^{(l-1)}$ \u4ee5\u6b0a\u91cd $\\alpha$ (\u901a\u5e38\u8a2d\u70ba\u53ef\u5b78\u7fd2\u53c3\u6578\u6216\u5e38\u6578) \u52a0\u5230\u7576\u524d\u5c64\uff1a\n                        <\/p>\n                        <div class=\"overflow-x-auto text-center py-4 bg-teal-50\/50 rounded-lg border border-teal-100 mb-4 font-mono text-teal-900 font-bold\">\n                            $$M^{(l)} = \\frac{Q^{(l)} (K^{(l)})^T}{\\sqrt{d_k}} + \\alpha M^{(l-1)}$$\n                        <\/div>\n                        <p class=\"text-stone-600 text-sm\">\n                            \u6700\u7d42\u7684\u6ce8\u610f\u529b\u77e9\u9663\u4f9d\u7136\u662f\uff1a $A^{(l)} = \\text{Softmax}(M^{(l)})$\u3002\u9019\u5c0f\u5c0f\u7684\u6539\u8b8a\uff0c\u537b\u5728\u6c42\u5c0e\u6642\u767c\u751f\u4e86\u8cea\u8b8a\u3002\n                        <\/p>\n                    <\/div>\n\n                    <!-- Step 2 -->\n                    <div id=\"math-step-2\" class=\"math-step-content\">\n                        <h3 class=\"text-xl font-bold text-stone-800 mb-4\">\u53cd\u5411\u50b3\u64ad\uff1a\u89e3\u6790\u504f\u5fae\u5206<\/h3>\n                        <p class=\"text-stone-600 mb-4\">\n                            \u5047\u8a2d\u640d\u5931\u51fd\u6578\u70ba $L$\u3002\u7576\u6211\u5011\u8981\u8a08\u7b97\u68af\u5ea6 $\\frac{\\partial L}{\\partial M^{(l-1)}}$ 
\u6642\uff0c\u6839\u64da\u5fae\u7a4d\u5206\u7684\u93c8\u9396\u5f8b (Chain Rule)\uff1a\n                        <\/p>\n                        <div class=\"overflow-x-auto text-center py-4 mb-4 font-mono text-stone-800 bg-stone-50 rounded\">\n                            $$\\frac{\\partial L}{\\partial M^{(l-1)}} = \\frac{\\partial L}{\\partial M^{(l)}} \\cdot \\frac{\\partial M^{(l)}}{\\partial M^{(l-1)}}$$\n                        <\/div>\n                        <p class=\"text-stone-600 mb-4\">\n                            \u56e0\u70ba $M^{(l)} = \\frac{Q^{(l)} (K^{(l)})^T}{\\sqrt{d_k}} + \\alpha M^{(l-1)}$\uff0c\u6211\u5011\u53ef\u4ee5\u8f15\u6613\u7b97\u51fa\u5176\u5c0d $M^{(l-1)}$ \u7684\u504f\u5c0e\uff1a\n                        <\/p>\n                        <div class=\"overflow-x-auto text-center py-4 mb-4 font-mono text-teal-800 font-bold text-xl\">\n                            $$\\frac{\\partial M^{(l)}}{\\partial M^{(l-1)}} = \\alpha \\cdot \\mathbf{I}$$\n                        <\/div>\n                        <p class=\"text-stone-600 text-sm\">\n                            \u9019\u88e1\u7684 $\\mathbf{I}$ \u662f\u55ae\u4f4d\u77e9\u9663\u3002\u9019\u610f\u5473\u8457\u68af\u5ea6\u53ef\u4ee5\u76f4\u63a5\u4e58\u4e0a $\\alpha$ \u7121\u640d\u5730\u6d41\u5411\u4e0a\u4e00\u5c64\uff01\n                        <\/p>\n                    <\/div>\n\n                    <!-- Step 3 -->\n                    <div id=\"math-step-3\" class=\"math-step-content\">\n                        <h3 class=\"text-xl font-bold text-stone-800 mb-4\">\u7d50\u8ad6\uff1a\u68af\u5ea6\u9ad8\u901f\u516c\u8def\u5efa\u7acb<\/h3>\n                        <p class=\"text-stone-600 mb-4\">\n                            \u5c07 Step 2 \u7684\u7d50\u679c\u4ee3\u56de\uff0c\u5b8c\u6574\u7684\u68af\u5ea6\u50b3\u905e\u516c\u5f0f\u8b8a\u6210\uff1a\n                        <\/p>\n                        <div class=\"overflow-x-auto text-center py-4 bg-teal-50\/50 border border-teal-100 rounded mb-4 
font-mono text-stone-800\">\n                            $$\\frac{\\partial L}{\\partial M^{(l-1)}} = \\frac{\\partial L}{\\partial M^{(l)}} \\cdot \\alpha + (\\text{\u4f86\u81ea } Q, K \\text{ \u8def\u5f91\u7684\u8907\u96dc\u68af\u5ea6})$$\n                        <\/div>\n                        <div class=\"space-y-4\">\n                            <div class=\"flex items-start\">\n                                <span class=\"text-red-500 mr-2\">\u274c<\/span>\n                                <p class=\"text-sm text-stone-600\"><strong>\u50b3\u7d71\u67b6\u69cb\uff1a<\/strong> \u53ea\u6709\u5f8c\u9762\u90a3\u9805\u300c\u8907\u96dc\u68af\u5ea6\u300d\uff0c\u5b83\u5fc5\u9808\u7d93\u904e Softmax \u7684\u5fae\u5206\u3002\u7576 Softmax \u98fd\u548c\u6642\uff0c\u8a72\u9805\u8da8\u8fd1\u65bc 0\uff0c\u68af\u5ea6\u5fb9\u5e95\u6d88\u5931\u3002<\/p>\n                            <\/div>\n                            <div class=\"flex items-start\">\n                                <span class=\"text-teal-600 mr-2\">\u2705<\/span>\n                                <p class=\"text-sm text-stone-600\"><strong>AttnRes \u67b6\u69cb\uff1a<\/strong> \u591a\u51fa\u4e86 $\\frac{\\partial L}{\\partial M^{(l)}} \\cdot \\alpha$ \u9019\u689d\u76f4\u63a5\u901a\u9053\u3002\u5373\u4f7f Softmax \u98fd\u548c\uff0c\u68af\u5ea6\u4f9d\u7136\u80fd\u7a69\u7a69\u5730\u4ee5\u6bd4\u4f8b $\\alpha$ \u7a7f\u900f\u56de\u6dfa\u5c64\u3002<\/p>\n                            <\/div>\n                        <\/div>\n                        <p class=\"mt-6 text-stone-700 font-medium\">\n                            \u9019\u5c31\u662f\u70ba\u4ec0\u9ebc\u52a0\u5165 AttnRes \u5f8c\uff0c\u6a21\u578b\u53ef\u4ee5\u8f15\u6613\u8a13\u7df4\u5230\u4e0a\u767e\u5c64\uff0c\u4e14\u5728\u8655\u7406\u8d85\u9577\u6587\u672c (Long Context) \u6642\u6ce8\u610f\u529b\u6a5f\u5236\u4e0d\u6703\u5d29\u584c\u7684\u6578\u5b78\u672c\u8cea\u3002\n                        <\/p>\n                    <\/div>\n\n                <\/div>\n          
  <\/div>\n        <\/section>\n\n        <!-- Section: Metrics \/ Performance -->\n        <section id=\"metrics\" class=\"scroll-mt-24\">\n            <header class=\"mb-10 text-center\">\n                <h2 class=\"text-3xl font-bold text-stone-900 mb-4\">\u6548\u80fd\u5be6\u8b49\uff1a\u8cc7\u6599\u6703\u8aaa\u8a71<\/h2>\n                <p class=\"text-stone-600 max-w-2xl mx-auto\">\n                    \u900f\u904e\u8996\u89ba\u5316\u5716\u8868\uff0c\u89c0\u5bdf\u52a0\u5165\u4e86 AttnRes \u6a5f\u5236\u5f8c\uff0c\u5c0d\u6a21\u578b\u8a13\u7df4\u6536\u6582\u901f\u5ea6\u4ee5\u53ca\u6df1\u5c64\u7db2\u8def\u68af\u5ea6\u7684\u4fdd\u8b77\u4f5c\u7528\u3002\uff08\u6b64\u70ba\u57fa\u65bc\u7406\u8ad6\u9810\u671f\u7684\u6a21\u64ec\u6578\u64da\u5206\u6790\uff09\n                <\/p>\n            <\/header>\n\n            <div class=\"grid lg:grid-cols-2 gap-10\">\n                <!-- Chart 1: Training Loss Convergence -->\n                <div class=\"bg-white p-6 rounded-2xl border border-stone-200 shadow-sm flex flex-col\">\n                    <h3 class=\"text-lg font-bold text-stone-800 mb-2\">\u6975\u6df1\u5c64\u7db2\u8def (100+ Layers) \u8a13\u7df4 Loss \u6bd4\u8f03<\/h3>\n                    <p class=\"text-xs text-stone-500 mb-6\">AttnRes \u6709\u6548\u7de9\u89e3\u68af\u5ea6\u6d88\u5931\uff0c\u4f7f\u6df1\u5c64\u7db2\u8def\u80fd\u5728\u66f4\u5c11\u7684 Step \u5167\u6536\u6582\u3002<\/p>\n                    <!-- Chart Container following exact styling requirements -->\n                    <div class=\"chart-container flex-grow\">\n                        <canvas id=\"lossChart\"><\/canvas>\n                    <\/div>\n                <\/div>\n\n                <!-- Chart 2: Gradient Norm across layers -->\n                <div class=\"bg-white p-6 rounded-2xl border border-stone-200 shadow-sm flex flex-col\">\n                    <h3 class=\"text-lg font-bold text-stone-800 mb-2\">\u53cd\u5411\u50b3\u64ad\u68af\u5ea6\u7bc4\u6578 (Gradient Norm) 
\u8870\u6e1b\u60c5\u5f62<\/h3>\n                    <p class=\"text-xs text-stone-500 mb-6\">\u5f9e\u7b2c 100 \u5c64\u53cd\u50b3\u81f3\u7b2c 1 \u5c64\u6642\uff0c\u50b3\u7d71 Attention \u7684\u68af\u5ea6\u5e7e\u4e4e\u6b78\u96f6\uff0c\u800c AttnRes \u4fdd\u6301\u5065\u5eb7\u6c34\u6e96\u3002<\/p>\n                    <!-- Chart Container following exact styling requirements -->\n                    <div class=\"chart-container flex-grow\">\n                        <canvas id=\"gradientChart\"><\/canvas>\n                    <\/div>\n                <\/div>\n            <\/div>\n        <\/section>\n        \n        <!-- Footer summary -->\n        <footer class=\"mt-12 border-t border-stone-200 pt-8 pb-12 text-center\">\n            <p class=\"text-stone-500 text-sm\">\n                \u7e3d\u7d50\uff1aAttnRes (Attention Residuals) \u662f\u4e00\u7a2e\u512a\u96c5\u4e14\u6578\u5b78\u57fa\u790e\u7a69\u56fa\u7684\u67b6\u69cb\u6539\u9032\u3002<br>\n                \u5728\u4e0d\u986f\u8457\u589e\u52a0\u8a08\u7b97\u91cf\u7684\u524d\u63d0\u4e0b\uff0c\u70ba\u9577\u6587\u672c\u3001\u6df1\u5c64\u6b21\u5927\u8a9e\u8a00\u6a21\u578b (LLMs) \u958b\u95e2\u4e86\u68af\u5ea6\u7684\u9ad8\u901f\u516c\u8def\u3002\n            <\/p>\n        <\/footer>\n\n    <\/main>\n\n    <script>\n        \/\/ DOM Elements for Architecture Toggle\n        const btnStd = document.getElementById('btn-std-arch');\n        const btnRes = document.getElementById('btn-res-arch');\n        const archStd = document.getElementById('arch-std');\n        const archRes = document.getElementById('arch-res');\n\n        \/\/ Toggle Architecture Views\n        btnStd.addEventListener('click', () => {\n            archStd.classList.remove('hidden');\n            archRes.classList.add('hidden');\n            \n            \/\/ Update button styles\n            btnStd.className = \"px-6 py-2 rounded-md font-medium text-sm transition-colors bg-teal-600 text-white shadow\";\n            btnRes.className = \"px-6 py-2 
rounded-md font-medium text-sm transition-colors text-stone-400 hover:text-white\";\n        });\n\n        btnRes.addEventListener('click', () => {\n            archStd.classList.add('hidden');\n            archRes.classList.remove('hidden');\n            \n            \/\/ Update button styles\n            btnRes.className = \"px-6 py-2 rounded-md font-medium text-sm transition-colors bg-teal-600 text-white shadow\";\n            btnStd.className = \"px-6 py-2 rounded-md font-medium text-sm transition-colors text-stone-400 hover:text-white\";\n        });\n\n        \/\/ Interactive Math Steps Navigation\n        const mathNavBtns = document.querySelectorAll('.math-nav-btn');\n        const mathContents = document.querySelectorAll('.math-step-content');\n\n        mathNavBtns.forEach(btn => {\n            btn.addEventListener('click', () => {\n                \/\/ Reset all buttons\n                mathNavBtns.forEach(b => {\n                    b.classList.remove('text-teal-700', 'bg-teal-50', 'border-teal-600');\n                    b.classList.add('text-stone-500', 'border-transparent');\n                });\n                \n                \/\/ Activate clicked button\n                btn.classList.add('text-teal-700', 'bg-teal-50', 'border-teal-600');\n                btn.classList.remove('text-stone-500', 'border-transparent');\n\n                \/\/ Hide all contents\n                mathContents.forEach(content => {\n                    content.classList.remove('active');\n                });\n\n                \/\/ Show target content\n                const targetId = btn.getAttribute('data-target');\n                document.getElementById(targetId).classList.add('active');\n            });\n        });\n\n        \/\/ Scroll Spy for Main Navigation\n        const sections = document.querySelectorAll(\"section\");\n        const navLinks = document.querySelectorAll(\".nav-link\");\n\n        window.addEventListener(\"scroll\", () => {\n            
let current = \"\";\n            sections.forEach((section) => {\n                const sectionTop = section.offsetTop;\n                if (scrollY >= sectionTop - 150) {\n                    current = section.getAttribute(\"id\");\n                }\n            });\n\n            navLinks.forEach((link) => {\n                link.classList.remove(\"active\", \"text-teal-600\", \"border-b-2\", \"border-teal-600\");\n                if (link.getAttribute(\"href\").includes(current) && current !== \"\") {\n                    link.classList.add(\"active\", \"text-teal-600\", \"border-b-2\", \"border-teal-600\");\n                }\n            });\n        });\n\n        \/\/ Initialize Chart.js when DOM is loaded\n        document.addEventListener('DOMContentLoaded', function() {\n            \n            \/\/ Common Chart Options for Clean UI\n            const commonOptions = {\n                responsive: true,\n                maintainAspectRatio: false, \/\/ CRITICAL for filling the .chart-container\n                plugins: {\n                    legend: {\n                        position: 'bottom',\n                        labels: {\n                            font: { family: 'Inter', size: 12 },\n                            usePointStyle: true,\n                            boxWidth: 8\n                        }\n                    },\n                    tooltip: {\n                        backgroundColor: 'rgba(28, 25, 23, 0.9)',\n                        titleFont: { family: 'Inter', size: 13 },\n                        bodyFont: { family: 'Inter', size: 13 },\n                        padding: 10,\n                        cornerRadius: 4,\n                        callbacks: {\n                            label: function(context) {\n                                let label = context.dataset.label || '';\n                                if (label) label += ': ';\n                                if (context.parsed.y !== null) {\n                         
           label += context.parsed.y.toFixed(4); \/\/ Keep decimals for math context\n                                }\n                                return label;\n                            }\n                        }\n                    }\n                },\n                scales: {\n                    x: {\n                        grid: { display: false, drawBorder: false },\n                        ticks: { font: { family: 'Inter' } }\n                    },\n                    y: {\n                        grid: { color: '#e7e5e4', borderDash: [4, 4], drawBorder: false },\n                        ticks: { font: { family: 'Inter' } }\n                    }\n                }\n            };\n\n            \/\/ Data Generation: Simulated Training Loss\n            const steps = Array.from({length: 20}, (_, i) => i * 100);\n            const lossStd = steps.map(s => 10 * Math.exp(-s\/500) + 2); \/\/ Slow, plateaus higher\n            const lossRes = steps.map(s => 10 * Math.exp(-s\/200) + 0.5); \/\/ Fast, plateaus lower\n\n            \/\/ Chart 1: Line Chart (Loss)\n            const ctxLoss = document.getElementById('lossChart').getContext('2d');\n            new Chart(ctxLoss, {\n                type: 'line',\n                data: {\n                    labels: steps.map(s => s + 'k'),\n                    datasets: [\n                        {\n                            label: 'Standard Attention',\n                            data: lossStd,\n                            borderColor: '#a8a29e', \/\/ stone-400\n                            backgroundColor: 'rgba(168, 162, 158, 0.1)',\n                            borderWidth: 2,\n                            tension: 0.4,\n                            fill: true,\n                            pointRadius: 0\n                        },\n                        {\n                            label: 'AttnRes (\u6b98\u5dee\u6a5f\u5236)',\n                            data: lossRes,\n                         
   borderColor: '#0d9488', \/\/ teal-600\n                            backgroundColor: 'rgba(13, 148, 136, 0.1)',\n                            borderWidth: 3,\n                            tension: 0.4,\n                            fill: true,\n                            pointRadius: 0\n                        }\n                    ]\n                },\n                options: {\n                    ...commonOptions,\n                    scales: {\n                        ...commonOptions.scales,\n                        y: { ...commonOptions.scales.y, title: { display: true, text: 'Training Loss' } },\n                        x: { ...commonOptions.scales.x, title: { display: true, text: 'Training Steps' } }\n                    }\n                }\n            });\n\n            \/\/ Data Generation: Gradient Norm across layers (100 -> 1, backward pass)\n            const layers = [100, 80, 60, 40, 20, 1]; \/\/ Reverse order roughly representing backprop distance\n            \/\/ Standard decays exponentially during backprop\n            const gradStd = layers.map(l => Math.pow(0.8, (100 - l)\/10)); \n            \/\/ AttnRes maintains norm much better due to the bypass\n            const gradRes = layers.map(l => Math.pow(0.98, (100 - l)\/10));\n\n            \/\/ Chart 2: Bar Chart (Gradient Flow)\n            const ctxGrad = document.getElementById('gradientChart').getContext('2d');\n            new Chart(ctxGrad, {\n                type: 'bar',\n                data: {\n                    labels: layers.map(l => `Layer ${l}`),\n                    datasets: [\n                        {\n                            label: 'Standard Attention',\n                            data: gradStd,\n                            backgroundColor: '#d6d3d1', \/\/ stone-300\n                            borderRadius: 4\n                        },\n                        {\n                            label: 'AttnRes (\u6b98\u5dee\u6a5f\u5236)',\n                           
 data: gradRes,\n                            backgroundColor: '#0f766e', \/\/ teal-700\n                            borderRadius: 4\n                        }\n                    ]\n                },\n                options: {\n                    ...commonOptions,\n                    plugins: {\n                        ...commonOptions.plugins,\n                        tooltip: {\n                            ...commonOptions.plugins.tooltip,\n                            callbacks: {\n                                label: function(context) {\n                                    return context.dataset.label + ': ' + context.parsed.y.toFixed(2);\n                                }\n                            }\n                        }\n                    },\n                    scales: {\n                        ...commonOptions.scales,\n                        y: { \n                            ...commonOptions.scales.y, \n                            title: { display: true, text: 'Relative Gradient Norm (Log Scale)' },\n                            type: 'logarithmic', \/\/ Use log scale to show the vanishing effect clearly\n                            min: 0.01,\n                            ticks: {\n                                callback: function(value, index, values) {\n                                    if(value === 1 || value === 0.1 || value === 0.01) return value;\n                                    return null;\n                                }\n                            }\n                        },\n                        x: { ...commonOptions.scales.x, title: { display: true, text: '\u53cd\u5411\u50b3\u64ad\u62b5\u9054\u5c64\u6578 (100\u70ba\u6700\u6df1\u5c64)' } }\n                    }\n                }\n            });\n        });\n    <\/script>\n<\/body>\n<\/html>\n","protected":false},"excerpt":{"rendered":"<p>\u6df1\u5165\u6dfa\u51fa Kimi AttnRes\uff1a\u6ce8\u610f\u529b\u6b98\u5dee\u6a5f\u5236\u7684\u6578\u5b78\u8207\u76f4\u89ba 
Kimi Att &hellip; <a href=\"https:\/\/ouyangminwei.com\/index.php\/2026\/03\/23\/kimi-attnres\/\">\u95b1\u8b80\u5168\u6587 <span class=\"meta-nav\">&rarr;<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"om_disable_all_campaigns":false,"_monsterinsights_skip_tracking":false,"_monsterinsights_sitenote_active":false,"_monsterinsights_sitenote_note":"","_monsterinsights_sitenote_category":0,"footnotes":""},"categories":[1],"tags":[],"post_format":[],"class_list":["post-1082","post","type-post","status-publish","format-standard","hentry","category-uncategorized"],"_edit_lock":"1774247434:1","_edit_last":"1","_aioseo_title":"#post_title #separator_sa #site_title","_aioseo_description":"#post_excerpt","_aioseo_keywords":"","_aioseo_og_title":"","_aioseo_og_description":"","_aioseo_og_article_section":"","_aioseo_og_article_tags":"","_aioseo_twitter_title":"","_aioseo_twitter_description":"","_oembed_2544c1d0cb3503ab4c4d558c3b3c8873":"","_oembed_time_2544c1d0cb3503ab4c4d558c3b3c8873":"","_oembed_99481806ecbe6ce4ee46f8588d320993":"","_oembed_db663acf973e82e6d9d80df71945dfb8":"","_oembed_16cdfab488f57db73586f4286af2704f":"","_wp_old_slug":"","_links":{"self":[{"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/posts\/1082","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/comments?post=1082"}],"version-history":[{"count":1,"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/posts\/1082\/revisions"}],"predecessor-version":[{"id":1083,"href":"https:\/\/ouyangminwei.com\/index.p
hp\/wp-json\/wp\/v2\/posts\/1082\/revisions\/1083"}],"wp:attachment":[{"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/media?parent=1082"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/categories?post=1082"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/tags?post=1082"},{"taxonomy":"post_format","embeddable":true,"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/post_format?post=1082"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}