{"id":880,"date":"2025-07-31T01:54:00","date_gmt":"2025-07-31T01:54:00","guid":{"rendered":"https:\/\/ouyangminwei.com\/?p=880"},"modified":"2025-07-18T09:55:31","modified_gmt":"2025-07-18T09:55:31","slug":"dqn","status":"publish","type":"post","link":"https:\/\/ouyangminwei.com\/index.php\/2025\/07\/31\/dqn\/","title":{"rendered":"\u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\u7684\u6f14\u5316"},"content":{"rendered":"\n<h1 class=\"wp-block-heading\">\u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\u7684\u6f14\u5316\uff1a\u5f9e\u50f9\u503c\u51fd\u6578\u5230\u4e16\u754c\u6a21\u578b\u8207\u6a21\u4eff\u5b78\u7fd2<\/h1>\n\n\n\n<h2 class=\"wp-block-heading\">\u5c0e\u8ad6<\/h2>\n\n\n\n<p>\u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\uff08Deep Reinforcement Learning, DRL\uff09\u662f\u4eba\u5de5\u667a\u6167\u9818\u57df\u4e2d\u6700\u5177\u7a81\u7834\u6027\u7684\u5206\u652f\u4e4b\u4e00\uff0c\u5b83\u5c07\u6df1\u5ea6\u5b78\u7fd2\u7684\u611f\u77e5\u8207\u8868\u5fb5\u80fd\u529b\u8207\u5f37\u5316\u5b78\u7fd2\u7684\u6c7a\u7b56\u5236\u5b9a\u6846\u67b6\u76f8\u7d50\u5408\uff0c\u8ce6\u4e88\u4e86\u667a\u80fd\u9ad4\uff08agent\uff09\u5728\u8907\u96dc\u3001\u9ad8\u7dad\u5ea6\u74b0\u5883\u4e2d\u5f9e\u96f6\u958b\u59cb\u5b78\u7fd2\u6700\u4f73\u7b56\u7565\u7684\u80fd\u529b <sup><\/sup>\u3002\u5f9e\u5728\u96c5\u9054\u5229\uff08Atari\uff09\u904a\u6232\u4e2d\u8d85\u8d8a\u4eba\u985e\u5c08\u5bb6\u6c34\u5e73 <sup><\/sup>\uff0c\u5230\u5728\u570d\u68cb\u7b49\u7b56\u7565\u904a\u6232\u4e2d\u9054\u5230\u5dd4\u5cf0\uff0cDRL \u7684\u767c\u5c55\u6b77\u7a0b\u4e0d\u50c5\u662f\u4e00\u7cfb\u5217\u6f14\u7b97\u6cd5\u7684\u8fed\u4ee3\uff0c\u66f4\u662f\u4e00\u5834\u95dc\u65bc\u5982\u4f55\u6709\u6548\u5b78\u7fd2\u3001\u7a69\u5b9a\u8a13\u7df4\u4ee5\u53ca\u5728\u4e0d\u540c\u8cc7\u8a0a\u689d\u4ef6\u4e0b\u505a\u51fa\u6c7a\u7b56\u7684\u601d\u60f3\u6f14\u5316\u3002 &nbsp;<\/p>\n\n\n\n<p>\u672c\u5831\u544a\u65e8\u5728\u6df1\u5165\u5256\u6790\u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\u5f9e\u5176 foundational breakthrough 
\u5230\u7576\u524d\u524d\u6cbf\u7814\u7a76\u7684\u6f14\u5316\u8def\u5f91\u3002\u6211\u5011\u5c07\u8ffd\u6eaf\u9019\u689d\u5f9e\u57fa\u790e\u7684 Deep Q-Learning\uff08DQN\uff09\u958b\u59cb\uff0c\u9014\u7d93\u7b56\u7565\u68af\u5ea6\u65b9\u6cd5\uff08Policy Gradient Methods\uff09\u7684\u6210\u719f\u4ee3\u8868 Proximal Policy Optimization\uff08PPO\uff09\uff0c\u518d\u5230\u5f15\u5165\u74b0\u5883\u5167\u90e8\u6a21\u578b\u7684 World Models\uff0c\u6700\u7d42\u63a2\u8a0e\u5728\u7f3a\u4e4f\u660e\u78ba\u734e\u52f5\u8a0a\u865f\u4e0b\u5982\u4f55\u5b78\u7fd2\u7684\u6a21\u4eff\u5b78\u7fd2\uff08Imitation Learning\uff09\u7bc4\u5f0f\u3002\u6b64\u5206\u6790\u5c07\u4e0d\u50c5\u50c5\u662f\u6f14\u7b97\u6cd5\u7684\u7f85\u5217\uff0c\u800c\u662f\u8457\u91cd\u65bc\u63ed\u793a\u6bcf\u4e00\u6b21\u6f14\u9032\u80cc\u5f8c\u7684\u52d5\u6a5f\uff1a\u524d\u4ee3\u6f14\u7b97\u6cd5\u9047\u5230\u4e86\u4f55\u7a2e\u74f6\u9838\uff1f\u65b0\u7684\u601d\u60f3\u548c\u6a5f\u5236\u5982\u4f55\u89e3\u6c7a\u9019\u4e9b\u554f\u984c\uff1f\u4ee5\u53ca\u9019\u4e9b\u5275\u65b0\u5982\u4f55\u5171\u540c\u5851\u9020\u4e86\u6211\u5011\u4eca\u5929\u6240\u77e5\u7684 DRL \u9818\u57df\u3002<\/p>\n\n\n\n<p>\u70ba\u4e86\u7d66\u8b80\u8005\u63d0\u4f9b\u4e00\u500b\u6e05\u6670\u7684\u5b8f\u89c0\u8996\u89d2\uff0c\u4e0b\u8868\u6982\u8ff0\u4e86\u672c\u5831\u544a\u5c07\u8981\u63a2\u8a0e\u7684\u4e3b\u8981 DRL \u7bc4\u5f0f\u53ca\u5176\u6838\u5fc3\u7279\u5fb5\u3002<\/p>\n\n\n\n<p><strong>\u8868 1\uff1a\u4e3b\u8981\u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\u7bc4\u5f0f\u5c0d\u6bd4\u5206\u6790<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-table\"><table class=\"has-fixed-layout\"><thead><tr><td>\u7bc4\u5f0f (Paradigm)<\/td><td>\u95dc\u9375\u6f14\u7b97\u6cd5 (Key Algorithms)<\/td><td>\u6838\u5fc3\u539f\u7406 (Core Principle)<\/td><td>\u6a23\u672c\u6548\u7387 (Sample Efficiency)<\/td><td>\u7a69\u5b9a\u6027 (Stability)<\/td><td>\u9023\u7e8c\u52d5\u4f5c\u7a7a\u9593\u8655\u7406 (Continuous Actions)<\/td><td>\u734e\u52f5\u51fd\u6578\u9700\u6c42 (Reward 
Function)<\/td><\/tr><\/thead><tbody><tr><td><strong>\u50f9\u503c\u57fa\u790e (Value-Based)<\/strong><\/td><td>DQN, Double DQN, Dueling DQN<\/td><td>\u5b78\u7fd2\u4e00\u500b\u50f9\u503c\u51fd\u6578\uff08Q-function\uff09\u4f86\u8a55\u4f30\u5728\u7279\u5b9a\u72c0\u614b\u4e0b\u63a1\u53d6\u6bcf\u500b\u52d5\u4f5c\u7684\u597d\u58de\uff0c\u7b56\u7565\u662f\u96b1\u542b\u5730\u9078\u64c7\u50f9\u503c\u6700\u9ad8\u7684\u52d5\u4f5c\u3002<\/td><td>\u8f03\u4f4e\uff08Off-policy \u7279\u6027\u6709\u52a9\u65bc\u91cd\u7528\u6578\u64da\uff09<\/td><td>\u4e2d\u7b49\uff08\u6613\u53d7\u4e0d\u7a69\u5b9a\u6027\u5f71\u97ff\uff0c\u9700\u7279\u6b8a\u6a5f\u5236\uff09<\/td><td>\u56f0\u96e3\uff08\u9700\u8981\u5c0d\u52d5\u4f5c\u7a7a\u9593\u9032\u884c\u96e2\u6563\u5316\uff09<\/td><td>\u5fc5\u8981<\/td><\/tr><tr><td><strong>\u7b56\u7565\u57fa\u790e (Policy-Based)<\/strong><\/td><td>REINFORCE, TRPO, PPO<\/td><td>\u76f4\u63a5\u5b78\u7fd2\u4e00\u500b\u53c3\u6578\u5316\u7684\u7b56\u7565\uff08Policy\uff09\uff0c\u8a72\u7b56\u7565\u5c07\u72c0\u614b\u6620\u5c04\u5230\u52d5\u4f5c\uff08\u6216\u52d5\u4f5c\u7684\u6a5f\u7387\u5206\u4f48\uff09\u3002<\/td><td>\u4f4e\uff08On-policy \u7279\u6027\u901a\u5e38\u5c0e\u81f4\u6a23\u672c\u6548\u7387\u4f4e\u4e0b\uff09<\/td><td>\u8f03\u4f4e\uff08\u68af\u5ea6\u4f30\u8a08\u8b8a\u7570\u6578\u9ad8\uff09<\/td><td>\u826f\u597d\uff08\u76f4\u63a5\u8f38\u51fa\u52d5\u4f5c\u5206\u4f48\uff09<\/td><td>\u5fc5\u8981<\/td><\/tr><tr><td><strong>\u884c\u52d5\u8005-\u8a55\u8ad6\u5bb6 (Actor-Critic)<\/strong><\/td><td>A2C, A3C, DDPG, SAC<\/td><td>\u7d50\u5408\u50f9\u503c\u57fa\u790e\u548c\u7b56\u7565\u57fa\u790e\u65b9\u6cd5\u3002\u884c\u52d5\u8005\uff08Actor\uff09\u8ca0\u8cac\u6c7a\u7b56\uff0c\u8a55\u8ad6\u5bb6\uff08Critic\uff09\u8ca0\u8cac\u8a55\u4f30\u884c\u52d5\u8005\u7684\u6c7a\u7b56\uff0c\u4ee5\u964d\u4f4e\u68af\u5ea6\u8b8a\u7570\u6578\u3002<\/td><td>\u4e2d\u7b49<\/td><td>\u4e2d\u9ad8\uff08Critic 
\u964d\u4f4e\u4e86\u8b8a\u7570\u6578\uff09<\/td><td>\u826f\u597d<\/td><td>\u5fc5\u8981<\/td><\/tr><tr><td><strong>\u6a21\u578b\u57fa\u790e (Model-Based)<\/strong><\/td><td>World Models, Dreamer<\/td><td>\u5b78\u7fd2\u4e00\u500b\u74b0\u5883\u7684\u52d5\u614b\u6a21\u578b\uff0c\u4e26\u5229\u7528\u8a72\u6a21\u578b\u9032\u884c\u898f\u5283\u6216\u751f\u6210\u6a21\u64ec\u7d93\u9a57\u4f86\u8a13\u7df4\u7b56\u7565\u3002<\/td><td>\u9ad8\uff08\u53ef\u900f\u904e\u6a21\u578b\u751f\u6210\u5927\u91cf\u6a21\u64ec\u6578\u64da\uff09<\/td><td>\u53d6\u6c7a\u65bc\u6a21\u578b\u6e96\u78ba\u6027<\/td><td>\u826f\u597d<\/td><td>\u5fc5\u8981<\/td><\/tr><tr><td><strong>\u6a21\u4eff\u5b78\u7fd2 (Imitation Learning)<\/strong><\/td><td>Behavior Cloning, DAgger, IRL, GAIL<\/td><td>\u5728\u6c92\u6709\u734e\u52f5\u51fd\u6578\u7684\u60c5\u6cc1\u4e0b\uff0c\u5f9e\u5c08\u5bb6\u793a\u7bc4\u4e2d\u5b78\u7fd2\u7b56\u7565\u3002<\/td><td>\u9ad8\uff08\u76f4\u63a5\u5229\u7528\u5c08\u5bb6\u6578\u64da\uff09<\/td><td>\u4e2d\u7b49\uff08\u9762\u81e8\u5171\u8b8a\u6578\u504f\u79fb\u7b49\u554f\u984c\uff09<\/td><td>\u53d6\u6c7a\u65bc\u5177\u9ad4\u6f14\u7b97\u6cd5<\/td><td>\u4e0d\u9700\u8981\uff08\u5f9e\u793a\u7bc4\u4e2d\u63a8\u65b7\uff09<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p>\u672c\u5831\u544a\u5c07\u4f9d\u5faa\u9019\u689d\u6f14\u5316\u8108\u7d61\uff0c\u6df1\u5165\u6bcf\u4e00\u500b\u7bc4\u5f0f\u7684\u5167\u90e8\uff0c\u63ed\u793a\u5176\u6a5f\u5236\u3001\u512a\u52e2\u8207\u6311\u6230\uff0c\u6700\u7d42\u63cf\u7e6a\u51fa\u4e00\u5e45\u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\u767c\u5c55\u7684\u5b8c\u6574\u5716\u666f\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u7b2c\u4e00\u90e8\u5206\uff1a\u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\u7684\u9ece\u660e &#8211; \u50f9\u503c\u57fa\u790e\u65b9\u6cd5<\/h2>\n\n\n\n<p>\u672c\u90e8\u5206\u5c07\u4ecb\u7d39\u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\u7684\u57fa\u790e\u6027\u7a81\u7834\u2014\u2014\u6df1\u5ea6 Q \u7db2\u8def\uff08Deep 
Q-Networks, DQN\uff09\uff0c\u5b83\u6210\u529f\u5730\u5c07\u50b3\u7d71\u7684 Q-learning \u539f\u7406\u8207\u6df1\u5ea6\u795e\u7d93\u7db2\u8def\u7684\u5f37\u5927\u529f\u80fd\u7d50\u5408\u8d77\u4f86\u3002\u6211\u5011\u5c07\u5256\u6790\u5b83\u89e3\u6c7a\u7684\u6838\u5fc3\u554f\u984c\u3001\u5f15\u5165\u7684\u65b0\u6311\u6230\uff0c\u4ee5\u53ca\u96a8\u5f8c\u6e67\u73fe\u7684\u3001\u65e8\u5728\u5b8c\u5584\u5176\u6027\u80fd\u7684\u773e\u591a\u5275\u65b0\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">1.1 \u5178\u7bc4\u8f49\u79fb\uff1a\u5f9e\u8868\u683c\u5f0f Q-Learning \u5230\u6df1\u5ea6 Q-\u7db2\u8def (DQN)<\/h3>\n\n\n\n<p>\u5728\u6df1\u5ea6\u5b78\u7fd2\u9769\u547d\u4e4b\u524d\uff0c\u5f37\u5316\u5b78\u7fd2\u9818\u57df\u4e3b\u8981\u7531\u8868\u683c\u5f0f\u65b9\u6cd5\u4e3b\u5c0e\uff0c\u5176\u4e2d Q-learning \u662f\u6700\u5177\u4ee3\u8868\u6027\u7684\u6f14\u7b97\u6cd5\u4e4b\u4e00\u3002\u5176\u6838\u5fc3\u601d\u60f3\u662f\uff0c\u667a\u80fd\u9ad4\u900f\u904e\u8207\u74b0\u5883\u4e92\u52d5\uff0c\u5b78\u7fd2\u4e26\u7dad\u8b77\u4e00\u500b\u7a31\u70ba Q-table \u7684\u8868\u683c\uff0c\u8a72\u8868\u683c\u5132\u5b58\u4e86\u5728\u6bcf\u500b\u53ef\u80fd\u72c0\u614b\uff08state\uff09\u4e0b\u63a1\u53d6\u6bcf\u500b\u53ef\u80fd\u52d5\u4f5c\uff08action\uff09\u6240\u80fd\u7372\u5f97\u7684\u9810\u671f\u7d2f\u7a4d\u56de\u5831\uff08Q-value\uff09<sup><\/sup>\u3002\u7136\u800c\uff0c\u9019\u7a2e\u65b9\u6cd5\u7684\u61c9\u7528\u7bc4\u570d\u53d7\u5230\u4e86\u5176\u6839\u672c\u6027\u9650\u5236\u7684\u56b4\u91cd\u675f\u7e1b\uff0c\u5373\u300c\u7dad\u5ea6\u707d\u96e3\u300d\uff08curse of dimensionality\uff09\u3002 &nbsp;<\/p>\n\n\n\n<p>\u5c0d\u65bc\u72c0\u614b\u7a7a\u9593\u6216\u52d5\u4f5c\u7a7a\u9593\u9f90\u5927\u6216\u9023\u7e8c\u7684\u554f\u984c\uff0cQ-table \u7684\u898f\u6a21\u6703\u5448\u6307\u6578\u7d1a\u589e\u9577\uff0c\u4f7f\u5176\u5728\u8a08\u7b97\u4e0a\u548c\u8a18\u61b6\u9ad4\u4e0a\u90fd\u8b8a\u5f97\u4e0d\u53ef\u884c 
<sup><\/sup>\u3002\u4f8b\u5982\uff0c\u4e00\u500b\u64c1\u6709\u6578\u5343\u500b\u72c0\u614b\u548c\u6bcf\u500b\u72c0\u614b\u6578\u5343\u500b\u52d5\u4f5c\u7684\u904a\u6232\uff0c\u5c31\u9700\u8981\u4e00\u500b\u5305\u542b\u6578\u767e\u842c\u500b\u55ae\u5143\u683c\u7684 Q-table <sup><\/sup>\u3002\u66f4\u91cd\u8981\u7684\u662f\uff0c\u8868\u683c\u5f0f\u65b9\u6cd5\u7121\u6cd5\u5c07\u5f9e\u5df2\u898b\u72c0\u614b\u4e2d\u5b78\u5230\u7684\u77e5\u8b58\u63a8\u5ee3\u5230\u672a\u898b\u904e\u7684\u76f8\u4f3c\u72c0\u614b\uff0c\u667a\u80fd\u9ad4\u5fc5\u9808\u89aa\u8eab\u7d93\u6b77\u6bcf\u4e00\u500b\u72c0\u614b-\u52d5\u4f5c\u5c0d\u624d\u80fd\u5b78\u7fd2\u5176\u50f9\u503c\uff0c\u9019\u6975\u5927\u5730\u9650\u5236\u4e86\u5176\u5b78\u7fd2\u6548\u7387\u548c\u6cdb\u5316\u80fd\u529b <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>2013 \u5e74\uff0cDeepMind \u767c\u8868\u4e86\u4e00\u7bc7\u958b\u5275\u6027\u7684\u8ad6\u6587\uff0c\u6a19\u8a8c\u8457\u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\u6642\u4ee3\u7684\u5230\u4f86 <sup><\/sup>\u3002\u4ed6\u5011\u63d0\u51fa\u7684\u6df1\u5ea6 Q-\u7db2\u8def\uff08DQN\uff09\u9996\u6b21\u6210\u529f\u5730\u8b93\u4e00\u500b\u6df1\u5ea6\u5b78\u7fd2\u6a21\u578b\u76f4\u63a5\u5f9e\u9ad8\u7dad\u5ea6\u7684\u611f\u5b98\u8f38\u5165\uff08\u5982\u539f\u59cb\u50cf\u7d20\uff09\u4e2d\u5b78\u7fd2\u63a7\u5236\u7b56\u7565\u3002\u5176\u6838\u5fc3\u5275\u65b0\u5728\u65bc\u7528\u4e00\u500b<strong>\u6df1\u5ea6\u795e\u7d93\u7db2\u8def\u4f5c\u70ba\u51fd\u6578\u903c\u8fd1\u5668\uff08function approximator\uff09<\/strong>\u4f86\u53d6\u4ee3\u5de8\u5927\u7684 Q-table <sup><\/sup>\u3002\u9019\u500b\u7db2\u8def\u7684\u8f38\u5165\u662f\u74b0\u5883\u7684\u72c0\u614b\uff08\u4f8b\u5982\uff0c\u7d93\u904e\u9810\u8655\u7406\u7684\u9023\u7e8c\u5e7e\u5e40\u904a\u6232\u756b\u9762\uff09\uff0c\u8f38\u51fa\u5247\u662f\u4e00\u500b\u5411\u91cf\uff0c\u5176\u4e2d\u6bcf\u500b\u5143\u7d20\u5c0d\u61c9\u4e00\u500b\u53ef\u80fd\u52d5\u4f5c\u7684 Q-value <sup><\/sup>\u3002 
&nbsp;<\/p>\n\n\n\n<p>\u9019\u7a2e\u51fd\u6578\u903c\u8fd1\u7684\u65b9\u6cd5\u8ce6\u4e88\u4e86\u667a\u80fd\u9ad4\u5f37\u5927\u7684<strong>\u6cdb\u5316\u80fd\u529b<\/strong>\uff0c\u5f9e\u800c\u514b\u670d\u4e86\u7dad\u5ea6\u707d\u96e3\u3002\u795e\u7d93\u7db2\u8def\uff0c\u7279\u5225\u662f\u5377\u7a4d\u795e\u7d93\u7db2\u8def\uff08CNN\uff09\uff0c\u80fd\u5920\u81ea\u52d5\u5f9e\u9ad8\u7dad\u8f38\u5165\u4e2d\u63d0\u53d6\u6709\u610f\u7fa9\u7684\u7279\u5fb5\uff08\u4f8b\u5982\uff0c\u5f9e\u904a\u6232\u756b\u9762\u4e2d\u8b58\u5225\u51fa\u7403\u3001\u64cb\u677f\u6216\u6575\u4eba\uff09<sup><\/sup>\u3002\u9019\u4f7f\u5f97\u667a\u80fd\u9ad4\u80fd\u5920\u5c07\u5f9e\u67d0\u4e9b\u72c0\u614b\u4e2d\u5b78\u5230\u7684\u7d93\u9a57\u61c9\u7528\u65bc\u5f9e\u672a\u898b\u904e\u4f46\u7279\u5fb5\u76f8\u4f3c\u7684\u65b0\u72c0\u614b <sup><\/sup>\u3002\u9019\u5c31\u50cf\u4eba\u985e\u5b78\u7fd2\u4e0b\u68cb\u4e00\u6a23\uff0c\u4e0d\u662f\u53bb\u8a18\u61b6\u6bcf\u4e00\u500b\u53ef\u80fd\u7684\u68cb\u76e4\u4f48\u5c40\uff0c\u800c\u662f\u5b78\u7fd2\u901a\u7528\u7684\u7b56\u7565\u548c\u539f\u5247 <sup><\/sup>\u3002DQN \u7684\u51fa\u73fe\uff0c\u8b49\u660e\u4e86\u5c07\u6df1\u5ea6\u5b78\u7fd2\u7684\u8868\u5fb5\u80fd\u529b\u8207\u5f37\u5316\u5b78\u7fd2\u7684\u6c7a\u7b56\u6846\u67b6\u76f8\u7d50\u5408\u662f\u53ef\u884c\u7684\uff0c\u70ba\u89e3\u6c7a\u4ee5\u5f80\u68d8\u624b\u7684\u8907\u96dc\u554f\u984c\u958b\u95e2\u4e86\u5168\u65b0\u7684\u9053\u8def\u3002 &nbsp;<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">1.2 \u99b4\u670d\u4e0d\u7a69\u5b9a\u6027\uff1a\u7d93\u9a57\u91cd\u64ad\u8207\u76ee\u6a19\u7db2\u8def\u7684\u6838\u5fc3\u6a5f\u5236<\/h3>\n\n\n\n<p>\u5118\u7ba1\u7528\u795e\u7d93\u7db2\u8def\u53d6\u4ee3 Q-table 
\u89e3\u6c7a\u4e86\u7dad\u5ea6\u554f\u984c\uff0c\u4f46\u9019\u7a2e\u7d50\u5408\u4e26\u975e\u4e00\u5e06\u98a8\u9806\u3002\u7c21\u55ae\u5730\u5c07\u5169\u8005\u7d50\u5408\u6703\u5c0e\u81f4\u8a13\u7df4\u904e\u7a0b\u6975\u5ea6\u4e0d\u7a69\u5b9a\u3002\u9019\u7a2e\u4e0d\u7a69\u5b9a\u6027\u6e90\u65bc\u5f37\u5316\u5b78\u7fd2\u8207\u6df1\u5ea6\u5b78\u7fd2\u57fa\u672c\u5047\u8a2d\u4e4b\u9593\u7684\u885d\u7a81\uff0c\u7279\u5225\u662f\u6240\u8b02\u7684\u300c\u6b7b\u4ea1\u4e09\u5143\u7d44\u300d\uff08Deadly Triad\uff09\u554f\u984c\u7684\u9ad4\u73fe\uff1a<\/p>\n\n\n\n<ol start=\"1\" class=\"wp-block-list\">\n<li><strong>\u6a23\u672c\u76f8\u95dc\u6027<\/strong>\uff1a\u5728\u5f37\u5316\u5b78\u7fd2\u4e2d\uff0c\u667a\u80fd\u9ad4\u6536\u96c6\u7684\u7d93\u9a57\u662f\u4e00\u500b\u6642\u9593\u4e0a\u9023\u7e8c\u7684\u5e8f\u5217\uff0c\u76f8\u9130\u7684\u6a23\u672c\u4e4b\u9593\u5177\u6709\u9ad8\u5ea6\u76f8\u95dc\u6027\u3002\u9019\u56b4\u91cd\u9055\u53cd\u4e86\u6df1\u5ea6\u5b78\u7fd2\u4e2d\u68af\u5ea6\u4e0b\u964d\u6cd5\u901a\u5e38\u6240\u4f9d\u8cf4\u7684\u300c\u7368\u7acb\u540c\u5206\u4f48\u300d\uff08I.I.D.\uff09\u7684\u6578\u64da\u5047\u8a2d \u3002\u5728\u9019\u7a2e\u76f8\u95dc\u6578\u64da\u4e0a\u8a13\u7df4\u795e\u7d93\u7db2\u8def\uff0c\u5bb9\u6613\u5c0e\u81f4\u6a21\u578b\u9677\u5165\u5c40\u90e8\u6700\u512a\u6216\u767c\u6563\u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u76ee\u6a19\u503c\u975e\u5b9a\u5e38\u6027<\/strong>\uff1aQ-learning \u7684\u66f4\u65b0\u76ee\u6a19\uff08target\uff09\u672c\u8eab\u5c31\u4f9d\u8cf4\u65bc Q-value \u7684\u4f30\u8a08\u3002\u7576\u4f7f\u7528\u4e00\u500b\u795e\u7d93\u7db2\u8def\u4f86\u4f30\u8a08 Q-value 
\u6642\uff0c\u7db2\u8def\u7684\u6b0a\u91cd\u5728\u6bcf\u4e00\u6b65\u8a13\u7df4\u5f8c\u90fd\u6703\u66f4\u65b0\u3002\u9019\u610f\u5473\u8457\uff0c\u7528\u65bc\u8a08\u7b97\u76ee\u6a19\u503c\u7684\u7db2\u8def\u4e5f\u5728\u4e0d\u65b7\u8b8a\u5316\uff0c\u5c0e\u81f4\u5b78\u7fd2\u76ee\u6a19\u672c\u8eab\u8655\u65bc\u4e0d\u7a69\u5b9a\u7684\u79fb\u52d5\u72c0\u614b\uff0c\u9019\u88ab\u7a31\u70ba\u300c\u79fb\u52d5\u76ee\u6a19\u554f\u984c\u300d\uff08moving target problem\uff09\u3002\u667a\u80fd\u9ad4\u5c31\u50cf\u5728\u8ffd\u9010\u4e00\u500b\u4e0d\u65b7\u79fb\u52d5\u7684\u9776\u5b50\uff0c\u96e3\u4ee5\u6536\u6582\u3002 \u00a0<\/li>\n<\/ol>\n\n\n\n<p>\u70ba\u4e86\u99b4\u670d\u9019\u7a2e\u4e0d\u7a69\u5b9a\u6027\uff0cDQN \u7684\u8a2d\u8a08\u8005\u5f15\u5165\u4e86\u5169\u500b\u81f3\u95dc\u91cd\u8981\u7684\u5de5\u7a0b\u89e3\u6c7a\u65b9\u6848\uff1a\u7d93\u9a57\u91cd\u64ad\uff08Experience Replay\uff09\u548c\u76ee\u6a19\u7db2\u8def\uff08Target Networks\uff09\u3002<\/p>\n\n\n\n<p><strong>\u7d93\u9a57\u91cd\u64ad (Experience Replay)<\/strong> \u9019\u662f\u4e00\u500b\u53d7\u751f\u7269\u5b78\u555f\u767c\u7684\u6a5f\u5236\uff0c\u5176\u6838\u5fc3\u601d\u60f3\u662f\u5efa\u7acb\u4e00\u500b\u5927\u578b\u7684\u8a18\u61b6\u9ad4\u7de9\u885d\u5340\uff08replay buffer\uff09\uff0c\u7528\u4f86\u5132\u5b58\u667a\u80fd\u9ad4\u5728\u8207\u74b0\u5883\u4e92\u52d5\u904e\u7a0b\u4e2d\u7d93\u6b77\u7684\u8f49\u63db\u5143\u7d44 <code>(s, a, r, s')<\/code>\uff0c\u5373\uff08\u72c0\u614b\u3001\u52d5\u4f5c\u3001\u734e\u52f5\u3001\u4e0b\u4e00\u72c0\u614b\uff09<sup><\/sup>\u3002\u5728\u8a13\u7df4\u968e\u6bb5\uff0c\u6f14\u7b97\u6cd5\u4e0d\u662f\u4f7f\u7528\u6700\u65b0\u7522\u751f\u7684\u55ae\u4e00\u6a23\u672c\uff0c\u800c\u662f\u5f9e\u9019\u500b\u7de9\u885d\u5340\u4e2d\u96a8\u6a5f\u63a1\u6a23\u4e00\u500b\u5c0f\u6279\u91cf\uff08mini-batch\uff09\u7684\u7d93\u9a57\u4f86\u66f4\u65b0\u795e\u7d93\u7db2\u8def\u3002\u9019\u500b\u7c21\u55ae\u7684\u904e\u7a0b\u9054\u6210\u4e86\u5169\u500b\u95dc\u9375\u76ee\u6a19\uff1a 
&nbsp;<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>\u6253\u7834\u6642\u9593\u76f8\u95dc\u6027<\/strong>\uff1a\u900f\u904e\u96a8\u6a5f\u63a1\u6a23\uff0c\u4f86\u81ea\u4e0d\u540c\u6642\u9593\u3001\u4e0d\u540c\u8ecc\u8de1\u7684\u7d93\u9a57\u88ab\u6df7\u5408\u5728\u4e00\u8d77\uff0c\u6709\u6548\u5730\u6253\u7834\u4e86\u6a23\u672c\u4e4b\u9593\u7684\u9806\u5e8f\u76f8\u95dc\u6027\u3002\u9019\u4f7f\u5f97\u8a13\u7df4\u6578\u64da\u66f4\u63a5\u8fd1 I.I.D. \u5047\u8a2d\uff0c\u5f9e\u800c\u4f7f\u795e\u7d93\u7db2\u8def\u7684\u8a13\u7df4\u66f4\u52a0\u7a69\u5b9a\u548c\u9ad8\u6548 \u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u63d0\u9ad8\u6a23\u672c\u6548\u7387<\/strong>\uff1a\u6bcf\u500b\u7d93\u9a57\u6a23\u672c\u90fd\u88ab\u5132\u5b58\u5728\u7de9\u885d\u5340\u4e2d\uff0c\u53ef\u4ee5\u5728\u5f8c\u7e8c\u7684\u8a13\u7df4\u4e2d\u88ab\u591a\u6b21\u91cd\u8907\u4f7f\u7528\u3002\u9019\u907f\u514d\u4e86\u50b3\u7d71\u7dda\u4e0a\u5b78\u7fd2\u4e2d\u300c\u7528\u5f8c\u5373\u68c4\u300d\u7684\u6578\u64da\u6d6a\u8cbb\uff0c\u6975\u5927\u5730\u63d0\u9ad8\u4e86\u6578\u64da\u5229\u7528\u7387\uff0c\u8b93\u667a\u80fd\u9ad4\u80fd\u5f9e\u6bcf\u4e00\u6b21\u8207\u74b0\u5883\u7684\u4e92\u52d5\u4e2d\u5b78\u5230\u66f4\u591a \u3002 \u00a0<\/li>\n<\/ul>\n\n\n\n<p><strong>\u76ee\u6a19\u7db2\u8def (Target Networks)<\/strong> \u70ba\u4e86\u89e3\u6c7a\u300c\u79fb\u52d5\u76ee\u6a19\u554f\u984c\u300d\uff0cDQN \u5f15\u5165\u4e86\u4e00\u500b\u7d50\u69cb\u76f8\u540c\u4f46\u53c3\u6578\u7368\u7acb\u7684\u7b2c\u4e8c\u500b\u795e\u7d93\u7db2\u8def\uff0c\u7a31\u70ba<strong>\u76ee\u6a19\u7db2\u8def<\/strong> <sup><\/sup>\u3002\u5728\u8a08\u7b97 Q-learning \u7684\u76ee\u6a19\u503c <code>$y_i = r + \\gamma \\max_{a'} Q(s', a'; \\theta^{-})$<\/code> \u6642\uff0c\u6700\u5927\u5316\u672a\u4f86 Q-value \u7684\u64cd\u4f5c\u662f\u5728\u9019\u500b\u76ee\u6a19\u7db2\u8def\u4e0a\u9032\u884c\u7684\uff0c\u5176\u53c3\u6578\u70ba 
<code>$\\theta^{-}$<\/code>\u3002\u800c\u6b63\u5728\u88ab\u8a13\u7df4\u7684\u7db2\u8def\uff0c\u6211\u5011\u7a31\u4e4b\u70ba\u300c\u7dda\u4e0a\u7db2\u8def\u300d\uff08online network\uff09\uff0c\u5176\u53c3\u6578\u70ba <code>$\\theta$<\/code>\u3002<\/p>\n\n\n\n<p>\u76ee\u6a19\u7db2\u8def\u7684\u53c3\u6578 <code>$\\theta^{-}$<\/code> \u4e26\u4e0d\u96a8\u6bcf\u4e00\u6b65\u68af\u5ea6\u4e0b\u964d\u800c\u66f4\u65b0\u3002\u76f8\u53cd\uff0c\u5b83\u5011\u88ab\u300c\u51cd\u7d50\u300d\u4e00\u6bb5\u6642\u9593\uff0c\u7136\u5f8c\u5b9a\u671f\u5730\uff08\u4f8b\u5982\uff0c\u6bcf\u9694 C \u500b\u8a13\u7df4\u6b65\u9a5f\uff09\u5f9e\u7dda\u4e0a\u7db2\u8def\u4e2d\u5b8c\u6574\u8907\u88fd\u53c3\u6578 <code>$\\theta$<\/code> \u904e\u4f86\u3002\u9019\u7a2e\u5ef6\u9072\u66f4\u65b0\u7684\u6a5f\u5236\uff0c\u70ba\u7dda\u4e0a\u7db2\u8def\u7684\u5b78\u7fd2\u904e\u7a0b\u63d0\u4f9b\u4e86\u4e00\u500b\u7a69\u5b9a\u3001\u4e00\u81f4\u7684\u76ee\u6a19\u3002\u9019\u5c31\u597d\u6bd4\u5728\u512a\u5316\u904e\u7a0b\u4e2d\u8a2d\u5b9a\u4e00\u500b\u56fa\u5b9a\u7684\u53c3\u8003\u9ede\uff0c\u53ef\u4ee5\u6709\u6548\u9632\u6b62\u9707\u76ea\u548c\u767c\u6563\uff0c\u5f9e\u800c\u7a69\u5b9a\u5b78\u7fd2\u904e\u7a0b <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u7e3d\u7d50\u4f86\u8aaa\uff0cDQN \u7684\u6210\u529f\u4e26\u4e0d\u50c5\u50c5\u5728\u65bc\u4f7f\u7528\u4e86\u795e\u7d93\u7db2\u8def\uff0c\u66f4\u95dc\u9375\u7684\u662f\u5b83\u900f\u904e\u7d93\u9a57\u91cd\u64ad\u548c\u76ee\u6a19\u7db2\u8def\u9019\u5169\u500b\u5de7\u5999\u7684\u8a2d\u8a08\uff0c\u6210\u529f\u5730\u89e3\u6c7a\u4e86\u5c07\u51fd\u6578\u903c\u8fd1\u8207\u81ea\u8209\uff08bootstrapping\uff09\u7d50\u5408\u6642\u7522\u751f\u7684\u5167\u5728\u4e0d\u7a69\u5b9a\u6027\u3002\u9019\u662f\u4e00\u6b21\u7a69\u5b9a\u5316\u4e00\u500b\u672c\u8cea\u4e0a\u4e0d\u7a69\u5b9a\u5b78\u7fd2\u52d5\u614b\u7684\u5de5\u7a0b\u52dd\u5229\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">1.3 DQN 
\u52d5\u7269\u5712\uff1a\u4e00\u500b\u6301\u7e8c\u6539\u9032\u7684\u751f\u614b\u7cfb\u7d71<\/h3>\n\n\n\n<p>\u6700\u521d\u7684 DQN \u6f14\u7b97\u6cd5\u70ba\u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\u5960\u5b9a\u4e86\u5805\u5be6\u7684\u57fa\u790e\uff0c\u4f46\u5b83\u4e5f\u5b58\u5728\u4e00\u4e9b\u56fa\u6709\u7684\u554f\u984c\uff0c\u9019\u6fc0\u767c\u4e86\u7814\u7a76\u754c\u958b\u767c\u4e00\u7cfb\u5217\u6539\u9032\u7248\u672c\uff0c\u5f62\u6210\u4e86\u4e00\u500b\u88ab\u6232\u7a31\u70ba\u300cDQN \u52d5\u7269\u5712\u300d\uff08DQN Zoo\uff09\u7684\u6f14\u7b97\u6cd5\u751f\u614b\u7cfb\u7d71\u3002\u9019\u4e9b\u6539\u9032\u65e8\u5728\u5f9e\u4e0d\u540c\u89d2\u5ea6\u63d0\u5347 DQN \u7684\u6027\u80fd\u3001\u7a69\u5b9a\u6027\u548c\u6548\u7387\u3002<\/p>\n\n\n\n<p><strong>\u96d9\u91cd DQN (Double DQN, DDQN)<\/strong> \u6a19\u6e96 DQN \u7684\u4e00\u500b\u4e3b\u8981\u554f\u984c\u662f<strong>Q-value \u7684\u904e\u5ea6\u4f30\u8a08<\/strong>\uff08overestimation\uff09<sup><\/sup>\u3002\u9019\u500b\u554f\u984c\u6e90\u65bc\u76ee\u6a19\u8a08\u7b97\u4e2d\u7684 <code>$\\max$<\/code> \u64cd\u4f5c\u3002\u7531\u65bc\u4f30\u8a08\u8aa4\u5dee\u7684\u5b58\u5728\uff0c<code>$\\max_{a'} Q(s', a')$<\/code> \u50be\u5411\u65bc\u9078\u64c7\u90a3\u4e9b\u88ab\u5076\u7136\u9ad8\u4f30\u4e86\u50f9\u503c\u7684\u52d5\u4f5c\uff0c\u5c0e\u81f4\u8a08\u7b97\u51fa\u7684\u76ee\u6a19\u503c\u7cfb\u7d71\u6027\u5730\u504f\u9ad8\u3002\u9019\u7a2e\u6b63\u5411\u504f\u5dee\u6703\u96a8\u8457\u5b78\u7fd2\u904e\u7a0b\u4e0d\u65b7\u7d2f\u7a4d\uff0c\u640d\u5bb3\u7b56\u7565\u7684\u6027\u80fd <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u70ba\u4e86\u89e3\u6c7a\u9019\u500b\u554f\u984c\uff0cHado van Hasselt \u7b49\u4eba\u63d0\u51fa\u4e86\u96d9\u91cd DQN\uff08Double DQN\uff09<sup><\/sup>\u3002\u5176\u6838\u5fc3\u601d\u60f3\u662f\u5c07<strong>\u52d5\u4f5c\u9078\u64c7<\/strong>\u8207<strong>\u52d5\u4f5c\u8a55\u4f30<\/strong>\u9019\u5169\u500b\u6b65\u9a5f\u89e3\u8026 
<sup><\/sup>\u3002\u5177\u9ad4\u4f86\u8aaa\uff0c\u5728\u8a08\u7b97\u76ee\u6a19\u503c\u6642\uff0cDDQN \u4f7f\u7528\u7576\u524d\u7684\u7dda\u4e0a\u7db2\u8def\uff08\u53c3\u6578 <code>$\\theta$<\/code>\uff09\u4f86\u9078\u64c7\u5728\u4e0b\u4e00\u72c0\u614b <code>s'<\/code> \u4e2d\u50f9\u503c\u6700\u9ad8\u7684\u52d5\u4f5c <code>a*<\/code>\uff0c\u5373 <code>$a^* = \\arg\\max_{a'} Q(s', a'; \\theta)$<\/code>\u3002\u7136\u5f8c\uff0c\u5b83\u4f7f\u7528\u7a69\u5b9a\u7684\u76ee\u6a19\u7db2\u8def\uff08\u53c3\u6578 <code>$\\theta^{-}$<\/code>\uff09\u4f86\u8a55\u4f30\u9019\u500b\u88ab\u9078\u4e2d\u52d5\u4f5c\u7684\u50f9\u503c\uff0c\u5373 <code>$y_i = r + \\gamma Q(s', a^*; \\theta^{-})$<\/code>\u3002\u900f\u904e\u9019\u7a2e\u65b9\u5f0f\uff0cDDQN \u907f\u514d\u4e86\u5728\u540c\u4e00\u500b\uff08\u53ef\u80fd\u5b58\u5728\u9ad8\u4f30\u504f\u5dee\u7684\uff09\u50f9\u503c\u4f30\u8a08\u96c6\u5408\u4e2d\u540c\u6642\u9032\u884c\u9078\u64c7\u548c\u8a55\u4f30\uff0c\u5f9e\u800c\u5f97\u5230\u4e86\u66f4\u6e96\u78ba\u7684\u50f9\u503c\u4f30\u8a08\uff0c\u4e26\u5728\u8a31\u591a\u904a\u6232\u4e2d\u53d6\u5f97\u4e86\u6bd4\u539f\u59cb DQN \u66f4\u597d\u7684\u6027\u80fd <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p><strong>\u6c7a\u9b25\u7db2\u8def\u67b6\u69cb (Dueling Network Architectures)<\/strong> \u53e6\u4e00\u9805\u91cd\u8981\u7684\u5275\u65b0\u662f\u6c7a\u9b25\u7db2\u8def\u67b6\u69cb\u3002\u5b83\u5c07 Q-network \u7684\u8f38\u51fa\u5c64\u5206\u89e3\u70ba\u5169\u500b\u7368\u7acb\u7684\u5206\u652f\uff08streams\uff09\uff1a\u4e00\u500b\u5206\u652f\u7528\u65bc\u4f30\u8a08\u72c0\u614b\u672c\u8eab\u7684\u50f9\u503c\u51fd\u6578\uff08state-value function\uff09<code>$V(s)$<\/code>\uff0c\u53e6\u4e00\u500b\u5206\u652f\u7528\u65bc\u4f30\u8a08\u5728\u8a72\u72c0\u614b\u4e0b\u6bcf\u500b\u52d5\u4f5c\u76f8\u5c0d\u65bc\u5e73\u5747\u52d5\u4f5c\u7684\u512a\u52e2\u51fd\u6578\uff08advantage function\uff09<code>$A(s, 
a)$<\/code>\u3002\u6700\u5f8c\uff0c\u9019\u5169\u500b\u5206\u652f\u7684\u8f38\u51fa\u88ab\u7d50\u5408\u8d77\u4f86\uff0c\u5f62\u6210\u6700\u7d42\u7684 Q-value \u4f30\u8a08\u3002\u9019\u7a2e\u67b6\u69cb\u7684\u512a\u52e2\u5728\u65bc\uff0c\u5b83\u80fd\u5920\u5728\u4e0d\u9700\u8981\u8a55\u4f30\u6bcf\u500b\u52d5\u4f5c\u5f71\u97ff\u7684\u60c5\u6cc1\u4e0b\uff0c\u5b78\u7fd2\u54ea\u4e9b\u72c0\u614b\u662f\u6709\u50f9\u503c\u7684\u3002\u9019\u5728\u8a31\u591a\u52d5\u4f5c\u7684\u50f9\u503c\u76f8\u8fd1\u7684\u5834\u666f\u4e2d\u5c24\u5176\u6709\u6548\uff0c\u56e0\u70ba\u7db2\u8def\u53ef\u4ee5\u66f4\u5c08\u6ce8\u65bc\u8a55\u4f30\u72c0\u614b\u7684\u50f9\u503c\uff0c\u5f9e\u800c\u5be6\u73fe\u66f4\u7a69\u5065\u7684\u7b56\u7565\u8a55\u4f30\u3002<\/p>\n\n\n\n<p><strong>\u512a\u5148\u7d93\u9a57\u91cd\u64ad (Prioritized Experience Replay, PER)<\/strong> PER \u662f\u5c0d\u6a19\u6e96\u7d93\u9a57\u91cd\u64ad\u6a5f\u5236\u7684\u91cd\u5927\u6539\u9032\uff0c\u65e8\u5728\u9032\u4e00\u6b65\u63d0\u5347\u6a23\u672c\u6548\u7387 <sup><\/sup>\u3002\u6a19\u6e96\u7684\u7d93\u9a57\u91cd\u64ad\u662f\u5f9e\u7de9\u885d\u5340\u4e2d\u5747\u52fb\u96a8\u6a5f\u5730\u63a1\u6a23\uff0c\u9019\u610f\u5473\u8457\u7121\u8ad6\u4e00\u500b\u7d93\u9a57\u591a\u9ebc\u300c\u6709\u555f\u767c\u6027\u300d\uff0c\u5b83\u88ab\u9078\u4e2d\u7684\u6a5f\u7387\u90fd\u662f\u4e00\u6a23\u7684\u3002PER \u7684\u6838\u5fc3\u601d\u60f3\u662f\uff0c\u667a\u80fd\u9ad4\u61c9\u8a72\u66f4\u983b\u7e41\u5730\u5f9e\u90a3\u4e9b\u5b83\u80fd\u5b78\u5230\u6700\u591a\u7684\u7d93\u9a57\u4e2d\u5b78\u7fd2\u3002 &nbsp;<\/p>\n\n\n\n<p>\u5b83\u900f\u904e\u975e\u5747\u52fb\u63a1\u6a23\u4f86\u5be6\u73fe\u9019\u4e00\u9ede\uff0c\u512a\u5148\u9078\u64c7\u90a3\u4e9b\u5177\u6709\u8f03\u5927\u6642\u5e8f\u5dee\u5206\u8aa4\u5dee\uff08TD-error\uff09\u7684\u8f49\u63db\u6a23\u672c\u3002\u8f03\u5927\u7684 TD-error 
\u901a\u5e38\u610f\u5473\u8457\u667a\u80fd\u9ad4\u5c0d\u8a72\u72c0\u614b\u50f9\u503c\u7684\u9810\u6e2c\u8207\u5be6\u969b\u89c0\u6e2c\u5230\u7684\u56de\u5831\u4e4b\u9593\u5b58\u5728\u8f03\u5927\u5dee\u7570\uff0c\u5373\u9019\u500b\u7d93\u9a57\u5c0d\u667a\u80fd\u9ad4\u4f86\u8aaa\u662f\u300c\u4ee4\u4eba\u9a5a\u8a1d\u300d\u6216\u4fe1\u606f\u91cf\u8c50\u5bcc\u7684\u3002\u900f\u904e\u512a\u5148\u91cd\u64ad\u9019\u4e9b\u6a23\u672c\uff0cPER \u4f7f\u5f97\u5b78\u7fd2\u904e\u7a0b\u80fd\u66f4\u96c6\u4e2d\u5730\u4fee\u6b63\u932f\u8aa4\u7684\u9810\u6e2c\uff0c\u5f9e\u800c\u52a0\u901f\u6536\u6582\u4e26\u63d0\u5347\u6574\u9ad4\u6027\u80fd\u3002<\/p>\n\n\n\n<p><strong>\u5f9e\u793a\u7bc4\u4e2d\u9032\u884c\u6df1\u5ea6 Q \u5b78\u7fd2 (Deep Q-learning from Demonstrations, DQfD)<\/strong> DQfD \u662f\u4e00\u500b\u91cd\u8981\u7684\u8b8a\u9ad4\uff0c\u5b83\u70ba\u89e3\u6c7a\u5f37\u5316\u5b78\u7fd2\u65e9\u671f\u8a13\u7df4\u968e\u6bb5\u6548\u7387\u4f4e\u4e0b\u548c\u6578\u64da\u9700\u6c42\u91cf\u5927\u7684\u554f\u984c\u63d0\u4f9b\u4e86\u601d\u8def\uff0c\u540c\u6642\u4e5f\u69cb\u5efa\u4e86\u901a\u5f80\u7b2c\u56db\u90e8\u5206\u6a21\u4eff\u5b78\u7fd2\u7684\u6a4b\u6a11 <sup><\/sup>\u3002\u6a19\u6e96\u7684 DRL \u6f14\u7b97\u6cd5\u901a\u5e38\u9700\u8981\u5927\u91cf\u7684\u8a66\u932f\u624d\u80fd\u5b78\u5230\u5408\u7406\u7684\u7b56\u7565\uff0c\u9019\u5728\u6a21\u64ec\u5668\u4e2d\u5c1a\u53ef\u63a5\u53d7\uff0c\u4f46\u5728\u73fe\u5be6\u4e16\u754c\u4efb\u52d9\u4e2d\u53ef\u80fd\u4ee3\u50f9\u9ad8\u6602\u751a\u81f3\u4e0d\u53ef\u884c\u3002DQfD \u900f\u904e\u5229\u7528\u5c11\u91cf\u7684\u5c08\u5bb6\u793a\u7bc4\u6578\u64da\u4f86\u6975\u5927\u5730\u52a0\u901f\u5b78\u7fd2\u904e\u7a0b\u3002\u5b83\u5728\u8a13\u7df4\u958b\u59cb\u524d\uff0c\u5148\u7528\u5c08\u5bb6\u6578\u64da\u5c0d\u7db2\u8def\u9032\u884c\u9810\u8a13\u7df4\u3002\u5728\u96a8\u5f8c\u7684\u4e92\u52d5\u5b78\u7fd2\u968e\u6bb5\uff0c\u5b83\u5c07\u6a19\u6e96\u7684 TD 
\u640d\u5931\u8207\u4e00\u500b\u76e3\u7763\u640d\u5931\u76f8\u7d50\u5408\uff0c\u5f8c\u8005\u9f13\u52f5\u667a\u80fd\u9ad4\u7684 Q-value \u80fd\u5920\u4f7f\u5c08\u5bb6\u63a1\u53d6\u7684\u52d5\u4f5c\u5177\u6709\u6bd4\u5176\u4ed6\u52d5\u4f5c\u66f4\u9ad8\u7684\u50f9\u503c\u3002\u9019\u7a2e\u6df7\u5408\u65b9\u6cd5\u4f7f\u5f97\u667a\u80fd\u9ad4\u80fd\u5920\u5f9e\u4e00\u500b\u66f4\u597d\u7684\u8d77\u9ede\u958b\u59cb\u5b78\u7fd2\uff0c\u5728\u8a13\u7df4\u521d\u671f\u5c31\u8868\u73fe\u51fa\u9060\u8d85\u6a19\u6e96 DQN \u7684\u6027\u80fd\uff0c\u4e26\u80fd\u66f4\u5feb\u5730\u9054\u5230\u751a\u81f3\u8d85\u8d8a\u5c08\u5bb6\u7684\u6c34\u5e73 <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u9019\u4e9b DQN \u7684\u8b8a\u9ad4\u5171\u540c\u69cb\u6210\u4e86\u4e00\u500b\u8c50\u5bcc\u7684\u5de5\u5177\u7bb1\uff0c\u5b83\u5011\u5f9e\u6e1b\u5c11\u4f30\u8a08\u504f\u5dee\u3001\u6539\u9032\u7db2\u8def\u67b6\u69cb\u3001\u512a\u5316\u6578\u64da\u63a1\u6a23\u7b56\u7565\u4ee5\u53ca\u878d\u5408\u5c08\u5bb6\u77e5\u8b58\u7b49\u591a\u500b\u7dad\u5ea6\uff0c\u7cfb\u7d71\u6027\u5730\u63d0\u5347\u4e86\u50f9\u503c\u57fa\u790e DRL \u65b9\u6cd5\u7684\u7a69\u5b9a\u6027\u8207\u6548\u7387\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u7b2c\u4e8c\u90e8\u5206\uff1a\u76f4\u63a5\u7b56\u7565\u512a\u5316 &#8211; 
\u7b56\u7565\u68af\u5ea6\u65b9\u6cd5\u7684\u8208\u8d77<\/h2>\n\n\n\n<p>\u7b2c\u4e00\u90e8\u5206\u63a2\u8a0e\u7684\u50f9\u503c\u57fa\u790e\u65b9\u6cd5\uff0c\u900f\u904e\u5b78\u7fd2\u4e00\u500b\u7cbe\u78ba\u7684\u50f9\u503c\u51fd\u6578\u4f86\u9593\u63a5\u6307\u5c0e\u6c7a\u7b56\uff0c\u53d6\u5f97\u4e86\u5de8\u5927\u6210\u529f\u3002\u7136\u800c\uff0c\u9019\u7a2e\u9593\u63a5\u6027\u4e5f\u5e36\u4f86\u4e86\u56fa\u6709\u7684\u5c40\u9650\u6027\u3002\u672c\u90e8\u5206\u5c07\u95e1\u8ff0\u70ba\u4f55\u9700\u8981\u5f9e\u5b78\u7fd2\u50f9\u503c\u8f49\u5411\u76f4\u63a5\u5b78\u7fd2\u7b56\u7565\uff0c\u4e26\u8ffd\u8e64\u7b56\u7565\u68af\u5ea6\u65b9\u6cd5\u5f9e\u57fa\u790e\u6982\u5ff5\u6f14\u5316\u5230\u7576\u4eca\u88ab\u5ee3\u6cdb\u4f7f\u7528\u7684 PPO \u6f14\u7b97\u6cd5\u7684\u5b8c\u6574\u8def\u5f91\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">2.1 \u50f9\u503c\u57fa\u790e\u5b78\u7fd2\u7684\u5c40\u9650\u8207\u5411\u7b56\u7565\u68af\u5ea6\u7684\u8f49\u8b8a<\/h3>\n\n\n\n<p>\u5118\u7ba1 DQN \u53ca\u5176\u8b8a\u9ad4\u5728\u8655\u7406\u9ad8\u7dad\u72c0\u614b\u7a7a\u9593\u65b9\u9762\u8868\u73fe\u51fa\u8272\uff0c\u4f46\u5b83\u5011\u5728\u67d0\u4e9b\u985e\u578b\u7684\u554f\u984c\u4e0a\u537b\u986f\u5f97\u529b\u4e0d\u5f9e\u5fc3\u3002\u9019\u4e9b\u5c40\u9650\u6027\u4fc3\u4f7f\u7814\u7a76\u8005\u5011\u63a2\u7d22\u4e00\u7a2e\u66f4\u76f4\u63a5\u7684\u7b56\u7565\u5b78\u7fd2\u65b9\u6cd5 <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u50f9\u503c\u57fa\u790e\u65b9\u6cd5\u7684\u4e3b\u8981\u74f6\u9838\u5305\u62ec\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>\u9023\u7e8c\u52d5\u4f5c\u7a7a\u9593\u7684\u6311\u6230<\/strong>\uff1aDQN \u7684\u6838\u5fc3\u64cd\u4f5c\u4e4b\u4e00\u662f\u5728\u6240\u6709\u53ef\u80fd\u7684\u52d5\u4f5c\u4e2d\u9078\u51fa Q-value \u6700\u9ad8\u7684\u52d5\u4f5c\uff08<code>$\\arg\\max_a 
Q(s,a)$<\/code>\uff09\u3002\u7576\u52d5\u4f5c\u7a7a\u9593\u662f\u96e2\u6563\u4e14\u6709\u9650\u6642\uff0c\u9019\u662f\u4e00\u500b\u7c21\u55ae\u7684\u6bd4\u8f03\u64cd\u4f5c\u3002\u7136\u800c\uff0c\u5728\u9023\u7e8c\u6216\u9ad8\u7dad\u5ea6\u7684\u52d5\u4f5c\u7a7a\u9593\u4e2d\uff08\u4f8b\u5982\uff0c\u6a5f\u5668\u4eba\u624b\u81c2\u95dc\u7bc0\u7684\u89d2\u5ea6\u6216\u6cb9\u9580\u7684\u529b\u5ea6\uff09\uff0c\u57f7\u884c\u9019\u500b <code>$\\max$<\/code> \u64cd\u4f5c\u672c\u8eab\u5c31\u662f\u4e00\u500b\u8907\u96dc\u7684\u3001\u96e3\u4ee5\u8655\u7406\u7684\u512a\u5316\u554f\u984c \u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u7121\u6cd5\u5b78\u7fd2\u96a8\u6a5f\u7b56\u7565<\/strong>\uff1a\u50f9\u503c\u57fa\u790e\u65b9\u6cd5\u901a\u5e38\u6703\u6536\u6582\u5230\u4e00\u500b\u78ba\u5b9a\u6027\u7b56\u7565\uff08deterministic policy\uff09\uff0c\u5373\u5728\u6bcf\u500b\u72c0\u614b\u4e0b\u7e3d\u662f\u9078\u64c7\u50f9\u503c\u6700\u9ad8\u7684\u52d5\u4f5c\u3002\u5373\u4f7f\u662f <code>$\\epsilon$<\/code>-greedy \u7b56\u7565\uff0c\u4e5f\u53ea\u662f\u5728\u78ba\u5b9a\u6027\u7b56\u7565\u7684\u57fa\u790e\u4e0a\u589e\u52a0\u4e86\u96a8\u6a5f\u63a2\u7d22\uff0c\u5176\u672c\u8cea\u4e0a\u662f\u6e96\u78ba\u5b9a\u6027\u7684\u3002\u7136\u800c\uff0c\u5728\u67d0\u4e9b\u74b0\u5883\u4e2d\uff0c\u6700\u512a\u7b56\u7565\u672c\u8eab\u5c31\u662f\u96a8\u6a5f\u7684\uff08stochastic\uff09\u3002\u4f8b\u5982\uff0c\u5728\u300c\u526a\u5200\u3001\u77f3\u982d\u3001\u5e03\u300d\u904a\u6232\u4e2d\uff0c\u4efb\u4f55\u78ba\u5b9a\u6027\u7b56\u7565\u90fd\u5f88\u5bb9\u6613\u88ab\u5c0d\u624b\u9810\u6e2c\u548c\u5229\u7528\u3002\u6b64\u5916\uff0c\u7576\u74b0\u5883\u4e2d\u5b58\u5728\u72c0\u614b\u6df7\u6dc6\uff08aliased states\uff0c\u5373\u4e0d\u540c\u7684\u771f\u5be6\u72c0\u614b\u5728\u667a\u80fd\u9ad4\u770b\u4f86\u662f\u76f8\u540c\u7684\uff09\u6642\uff0c\u96a8\u6a5f\u7b56\u7565\u4e5f\u53ef\u80fd\u662f\u5fc5\u8981\u7684 \u3002 
\u00a0<\/li>\n\n\n\n<li><strong>\u7b56\u7565\u7684\u4e0d\u7a69\u5b9a\u6027<\/strong>\uff1a\u5728\u50f9\u503c\u57fa\u790e\u65b9\u6cd5\u4e2d\uff0c\u7b56\u7565\u662f\u96b1\u542b\u5730\u7531\u50f9\u503c\u51fd\u6578\u6c7a\u5b9a\u7684\u3002\u9019\u610f\u5473\u8457\uff0cQ-value \u4f30\u8a08\u7684\u4e00\u500b\u5fae\u5c0f\u8b8a\u5316\uff0c\u53ef\u80fd\u6703\u5c0e\u81f4 <code>$\\arg\\max$<\/code> \u7684\u7d50\u679c\u767c\u751f\u7a81\u8b8a\uff0c\u5f9e\u800c\u5f15\u8d77\u7b56\u7565\u7684\u5287\u70c8\u3001\u975e\u9023\u7e8c\u6027\u8b8a\u5316\u3002\u4f8b\u5982\uff0c\u67d0\u500b\u52d5\u4f5c\u7684 Q-value \u5f9e 0.22 \u8f15\u5fae\u589e\u52a0\u5230 0.23\uff0c\u5c31\u53ef\u80fd\u4f7f\u5176\u6210\u70ba\u65b0\u7684\u6700\u512a\u52d5\u4f5c\uff0c\u5c0e\u81f4\u7b56\u7565\u5f9e\u4e3b\u8981\u9078\u64c7\u300c\u5de6\u300d\u7a81\u7136\u8b8a\u70ba\u4e3b\u8981\u9078\u64c7\u300c\u53f3\u300d\u3002\u9019\u7a2e\u4e0d\u9023\u7e8c\u7684\u8b8a\u5316\u7d66\u5b78\u7fd2\u904e\u7a0b\u5e36\u4f86\u4e86\u4e0d\u7a69\u5b9a\u6027 \u3002 \u00a0<\/li>\n<\/ul>\n\n\n\n<p>\u70ba\u4e86\u89e3\u6c7a\u9019\u4e9b\u554f\u984c\uff0c<strong>\u7b56\u7565\u68af\u5ea6\uff08Policy Gradient, PG\uff09\u65b9\u6cd5\u61c9\u904b\u800c\u751f\u3002\u5176\u6838\u5fc3\u601d\u60f3\u662f\u653e\u68c4\u9593\u63a5\u7684\u50f9\u503c\u5b78\u7fd2\uff0c\u8f49\u800c\u76f4\u63a5\u5c0d\u7b56\u7565\u672c\u8eab\u9032\u884c\u53c3\u6578\u5316\u548c\u512a\u5316<\/strong>\u3002\u5177\u9ad4\u4f86\u8aaa\uff0c\u6211\u5011\u7528\u4e00\u500b\u5e36\u6709\u53c3\u6578 <code>$\\theta$<\/code> \u7684\u795e\u7d93\u7db2\u8def\u4f86\u8868\u793a\u7b56\u7565 <code>$\\pi_\\theta(a|s)$<\/code>\uff0c\u9019\u500b\u7db2\u8def\u76f4\u63a5\u8f38\u51fa\u5728\u72c0\u614b <code>s<\/code> \u4e0b\u63a1\u53d6\u52d5\u4f5c <code>a<\/code> 
\u7684\u6a5f\u7387\uff08\u6216\u5c0d\u65bc\u9023\u7e8c\u52d5\u4f5c\uff0c\u8f38\u51fa\u4e00\u500b\u6a5f\u7387\u5206\u4f48\u7684\u53c3\u6578\uff0c\u5982\u9ad8\u65af\u5206\u4f48\u7684\u5747\u503c\u548c\u6a19\u6e96\u5dee\uff09\u3002\u5b78\u7fd2\u7684\u76ee\u6a19\u662f\u627e\u5230\u4e00\u7d44\u6700\u512a\u53c3\u6578 <code>$\\theta^*$<\/code>\uff0c\u4f7f\u5f97\u9810\u671f\u7d2f\u7a4d\u56de\u5831 <code>$J(\\theta)$<\/code> \u6700\u5927\u5316\u3002\u9019\u500b\u512a\u5316\u904e\u7a0b\u662f\u900f\u904e\u5728 <code>$J(\\theta)$<\/code> \u7684\u68af\u5ea6\u65b9\u5411\u4e0a\u9032\u884c\u68af\u5ea6\u4e0a\u5347\u4f86\u5be6\u73fe\u7684 <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u6839\u64da\u7531 Sutton \u548c Barto \u7b49\u4eba\u8b49\u660e\u7684\u7b56\u7565\u68af\u5ea6\u5b9a\u7406\uff08Policy Gradient Theorem\uff09\uff0c\u76ee\u6a19\u51fd\u6578\u7684\u68af\u5ea6\u53ef\u4ee5\u8868\u793a\u70ba\u4e00\u500b\u671f\u671b\u7684\u5f62\u5f0f\uff0c\u9019\u4f7f\u5f97\u6211\u5011\u53ef\u4ee5\u900f\u904e\u63a1\u6a23\u4f86\u4f30\u8a08\u68af\u5ea6\u4e26\u66f4\u65b0\u7b56\u7565 <sup><\/sup>\u3002\u9019\u7a2e\u76f4\u63a5\u512a\u5316\u7b56\u7565\u7684\u65b9\u5f0f\uff0c\u5929\u7136\u5730\u652f\u6301\u9023\u7e8c\u52d5\u4f5c\u7a7a\u9593\u548c\u96a8\u6a5f\u7b56\u7565\uff0c\u4e26\u4e14\u7b56\u7565\u7684\u66f4\u65b0\u904e\u7a0b\u66f4\u52a0\u5e73\u6ed1\uff0c\u5f9e\u800c\u70ba\u89e3\u6c7a\u50f9\u503c\u57fa\u790e\u65b9\u6cd5\u7684\u56fa\u6709\u96e3\u984c\u63d0\u4f9b\u4e86\u6839\u672c\u6027\u7684\u9014\u5f91\u3002 &nbsp;<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">2.2 \u884c\u52d5\u8005-\u8a55\u8ad6\u5bb6\u6846\u67b6\uff1a\u4e00\u7a2e\u6df7\u5408\u65b9\u6cd5<\/h3>\n\n\n\n<p>\u5118\u7ba1\u7b56\u7565\u68af\u5ea6\u65b9\u6cd5\u5728\u7406\u8ad6\u4e0a\u5177\u6709\u5438\u5f15\u529b\uff0c\u4f46\u6700\u57fa\u790e\u7684 PG \u6f14\u7b97\u6cd5\uff08\u5982 
REINFORCE\uff09\u5728\u5be6\u8e10\u4e2d\u537b\u9762\u81e8\u4e00\u500b\u56b4\u5cfb\u7684\u6311\u6230\uff1a<strong>\u68af\u5ea6\u4f30\u8a08\u7684\u9ad8\u8b8a\u7570\u6578\uff08high variance\uff09<\/strong>\u3002REINFORCE \u9019\u985e\u6f14\u7b97\u6cd5\u901a\u5e38\u4f9d\u8cf4\u65bc\u5b8c\u6574\u7684\u8499\u5730\u5361\u7f85\uff08Monte Carlo\uff09\u8ecc\u8de1\u4f86\u4f30\u8a08\u56de\u5831\uff08return\uff09\uff0c\u5373\u5f9e\u67d0\u500b\u6642\u9593\u6b65\u958b\u59cb\u76f4\u5230\u6574\u500b\u56de\u5408\u7d50\u675f\u7684\u7d2f\u7a4d\u734e\u52f5\u3002\u7531\u65bc\u74b0\u5883\u7684\u96a8\u6a5f\u6027\u548c\u7b56\u7565\u7684\u96a8\u6a5f\u6027\uff0c\u5373\u4f7f\u5728\u540c\u4e00\u500b\u72c0\u614b\u4e0b\uff0c\u57f7\u884c\u4e00\u500b\u56de\u5408\u6240\u5f97\u5230\u7684\u6700\u7d42\u56de\u5831\u4e5f\u53ef\u80fd\u6703\u6709\u5de8\u5927\u7684\u6ce2\u52d5\u3002\u9019\u7a2e\u9ad8\u566a\u8072\u7684\u56de\u5831\u4f30\u8a08\u5c0e\u81f4\u8a08\u7b97\u51fa\u7684\u7b56\u7565\u68af\u5ea6\u4e5f\u5177\u6709\u5f88\u9ad8\u7684\u8b8a\u7570\u6578\uff0c\u4f7f\u5f97\u5b78\u7fd2\u904e\u7a0b\u975e\u5e38\u7de9\u6162\u4e14\u4e0d\u7a69\u5b9a <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u70ba\u4e86\u61c9\u5c0d\u9019\u4e00\u6311\u6230\uff0c<strong>\u884c\u52d5\u8005-\u8a55\u8ad6\u5bb6\uff08Actor-Critic, AC\uff09<\/strong>\u6846\u67b6\u88ab\u63d0\u51fa\uff0c\u5b83\u5de7\u5999\u5730\u878d\u5408\u4e86\u7b56\u7565\u57fa\u790e\u548c\u50f9\u503c\u57fa\u790e\u65b9\u6cd5\u7684\u512a\u9ede <sup><\/sup>\u3002AC \u6846\u67b6\u5c07\u667a\u80fd\u9ad4\u5206\u89e3\u70ba\u5169\u500b\u5354\u540c\u5de5\u4f5c\u7684\u7d44\u4ef6\uff1a &nbsp;<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>\u884c\u52d5\u8005\uff08Actor\uff09<\/strong>\uff1a\u9019\u662f\u4e00\u500b\u7b56\u7565\u7db2\u8def\uff0c\u5176\u53c3\u6578\u70ba <code>$\\theta$<\/code>\uff0c\u5373 <code>$\\pi_\\theta(a|s)$<\/code>\u3002\u5b83\u7684\u8077\u8cac\u662f\u6839\u64da\u7576\u524d\u72c0\u614b <code>s<\/code> \u9078\u64c7\u4e00\u500b\u52d5\u4f5c 
<code>a<\/code>\uff0c\u76f4\u63a5\u63a7\u5236\u667a\u80fd\u9ad4\u7684\u884c\u70ba\u3002\u9019\u90e8\u5206\u7e7c\u627f\u4e86\u7b56\u7565\u68af\u5ea6\u65b9\u6cd5\u7684\u512a\u9ede \u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u8a55\u8ad6\u5bb6\uff08Critic\uff09<\/strong>\uff1a\u9019\u662f\u4e00\u500b\u50f9\u503c\u7db2\u8def\uff0c\u5176\u53c3\u6578\u70ba <code>$\\phi$<\/code>\u3002\u5b83\u7684\u8077\u8cac\u662f\u8a55\u4f30\u300c\u884c\u52d5\u8005\u300d\u6240\u9078\u64c7\u7684\u52d5\u4f5c\u7684\u597d\u58de\u3002\u5b83\u5b78\u7fd2\u4e00\u500b\u50f9\u503c\u51fd\u6578\uff0c\u5982\u72c0\u614b\u50f9\u503c\u51fd\u6578 <code>$V_\\phi(s)$<\/code> \u6216\u72c0\u614b-\u52d5\u4f5c\u50f9\u503c\u51fd\u6578 <code>$Q_\\phi(s,a)$<\/code> \u3002 \u00a0<\/li>\n<\/ul>\n\n\n\n<p>\u9019\u5169\u500b\u7d44\u4ef6\u7684\u5354\u4f5c\u6d41\u7a0b\u5982\u4e0b\uff1a\u884c\u52d5\u8005\u6839\u64da\u5176\u7b56\u7565\u57f7\u884c\u4e00\u500b\u52d5\u4f5c\uff0c\u8a55\u8ad6\u5bb6\u5247\u89c0\u5bdf\u9019\u500b\u52d5\u4f5c\u7522\u751f\u7684\u7d50\u679c\uff0c\u4e26\u7d66\u51fa\u4e00\u500b\u8a55\u5206\u3002\u9019\u500b\u8a55\u5206\u88ab\u7528\u4f86\u6307\u5c0e\u884c\u52d5\u8005\u7684\u7b56\u7565\u66f4\u65b0\u3002\u8207\u5176\u4f7f\u7528\u5145\u6eff\u566a\u8072\u7684\u8499\u5730\u5361\u7f85\u56de\u5831\uff0cAC \u65b9\u6cd5\u4f7f\u7528\u8a55\u8ad6\u5bb6\u7684\u50f9\u503c\u4f30\u8a08\u4f86\u8a08\u7b97\u4e00\u500b\u66f4\u7a69\u5b9a\u7684\u5b78\u7fd2\u8a0a\u865f\u3002<\/p>\n\n\n\n<p>\u9019\u500b\u5b78\u7fd2\u8a0a\u865f\u901a\u5e38\u4ee5<strong>\u512a\u52e2\u51fd\u6578\uff08Advantage Function\uff09<\/strong>\u7684\u5f62\u5f0f\u51fa\u73fe\uff0c\u5b9a\u7fa9\u70ba <code>$A(s,a) = Q(s,a) - V(s)$<\/code> <sup><\/sup>\u3002\u512a\u52e2\u51fd\u6578\u7684\u76f4\u89c0\u610f\u7fa9\u662f\uff1a\u5728\u72c0\u614b <code>s<\/code> \u4e0b\uff0c\u63a1\u53d6\u52d5\u4f5c <code>a<\/code> \u6bd4\u5e73\u5747\u60c5\u6cc1\u4e0b\uff08\u5373\u9075\u5faa\u7576\u524d\u7b56\u7565\uff09\u8981\u597d\u591a\u5c11\u3002\u5982\u679c <code>$A(s,a) 
&gt; 0$<\/code>\uff0c\u8aaa\u660e\u52d5\u4f5c <code>a<\/code> \u662f\u4e00\u500b\u6bd4\u5e73\u5747\u66f4\u597d\u7684\u9078\u64c7\uff0c\u7b56\u7565\u66f4\u65b0\u5c31\u6703\u589e\u52a0\u9078\u64c7 <code>a<\/code> \u7684\u6a5f\u7387\uff1b\u53cd\u4e4b\uff0c\u5982\u679c <code>$A(s,a) &lt; 0$<\/code>\uff0c\u5247\u6703\u964d\u4f4e\u9078\u64c7 <code>a<\/code> \u7684\u6a5f\u7387\u3002<\/p>\n\n\n\n<p>\u900f\u904e\u4f7f\u7528\u8a55\u8ad6\u5bb6\u63d0\u4f9b\u7684\u3001\u57fa\u65bc\u81ea\u8209\uff08bootstrapping\uff09\u7684\u50f9\u503c\u4f30\u8a08\u4f86\u8a08\u7b97\u512a\u52e2\u51fd\u6578\uff0cAC \u65b9\u6cd5\u6709\u6548\u5730\u7528\u4e00\u500b\u7565\u6709\u504f\u5dee\u3001\u4f46\u8b8a\u7570\u6578\u4f4e\u5f97\u591a\u7684\u5b78\u7fd2\u8a0a\u865f\u53d6\u4ee3\u4e86\u9ad8\u8b8a\u7570\u6578\u7684\u8499\u5730\u5361\u7f85\u56de\u5831\u3002\u9019\u6975\u5927\u5730\u7a69\u5b9a\u548c\u52a0\u901f\u4e86\u7b56\u7565\u7684\u5b78\u7fd2\u904e\u7a0b\uff0c\u4f7f\u5f97 AC \u6846\u67b6\u6210\u70ba\u73fe\u4ee3\u7b56\u7565\u68af\u5ea6\u65b9\u6cd5\u7684\u4e3b\u6d41\u67b6\u69cb\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">2.3 \u900f\u904e\u5e73\u884c\u5316\u64f4\u5c55\uff1aA2C \u8207 A3C<\/h3>\n\n\n\n<p>\u96a8\u8457\u884c\u52d5\u8005-\u8a55\u8ad6\u5bb6\uff08AC\uff09\u6846\u67b6\u7684\u78ba\u7acb\uff0c\u4e0b\u4e00\u500b\u6311\u6230\u662f\u5982\u4f55\u6709\u6548\u5730\u5c07\u5176\u64f4\u5c55\u5230\u5927\u898f\u6a21\u554f\u984c\u4e2d\uff0c\u4ee5\u5229\u7528\u73fe\u4ee3\u8a08\u7b97\u786c\u9ad4\u7684\u5e73\u884c\u8655\u7406\u80fd\u529b\u3002\u5728\u9019\u4e00\u80cc\u666f\u4e0b\uff0cDeepMind \u65bc 2016 \u5e74\u63d0\u51fa\u7684<strong>\u7570\u6b65\u512a\u52e2\u884c\u52d5\u8005-\u8a55\u8ad6\u5bb6\uff08Asynchronous Advantage Actor-Critic, A3C\uff09<\/strong>\u6f14\u7b97\u6cd5\u6210\u70ba\u4e86\u4e00\u500b\u91cc\u7a0b\u7891 <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>A3C \u7684\u6838\u5fc3\u601d\u60f3\u662f\u5229\u7528\u591a\u500b CPU 
\u6838\u5fc3\u9032\u884c\u5e73\u884c\u8a08\u7b97\u3002\u5b83\u5275\u5efa\u4e86\u591a\u500b\u300c\u5de5\u4f5c\u8005\u300d\uff08worker\uff09\u667a\u80fd\u9ad4\uff0c\u6bcf\u500b\u5de5\u4f5c\u8005\u90fd\u6709\u81ea\u5df1\u7368\u7acb\u7684\u7db2\u8def\u53c3\u6578\u526f\u672c\u548c\u74b0\u5883\u526f\u672c <sup><\/sup>\u3002\u9019\u4e9b\u5de5\u4f5c\u8005\u5728\u5404\u81ea\u7684\u74b0\u5883\u4e2d\u7368\u7acb\u5730\u3001\u5e73\u884c\u5730\u6536\u96c6\u7d93\u9a57\u6578\u64da\u4e26\u8a08\u7b97\u7b56\u7565\u68af\u5ea6\u3002\u7136\u5f8c\uff0c\u5b83\u5011\u5c07\u8a08\u7b97\u51fa\u7684\u68af\u5ea6<strong>\u7570\u6b65\u5730<\/strong>\uff08asynchronously\uff09\u61c9\u7528\u65bc\u4e00\u500b\u4e2d\u592e\u7684\u3001\u5168\u57df\u7684\u7db2\u8def\u6a21\u578b\u4e0a <sup><\/sup>\u3002\u5b8c\u6210\u66f4\u65b0\u5f8c\uff0c\u5de5\u4f5c\u8005\u6703\u5c07\u81ea\u5df1\u7684\u672c\u5730\u7db2\u8def\u53c3\u6578\u8207\u66f4\u65b0\u5f8c\u7684\u5168\u57df\u53c3\u6578\u540c\u6b65\uff0c\u7136\u5f8c\u7e7c\u7e8c\u4e0b\u4e00\u8f2a\u7684\u6578\u64da\u6536\u96c6\u3002 &nbsp;<\/p>\n\n\n\n<p>\u9019\u7a2e\u7570\u6b65\u66f4\u65b0\u7684\u6a5f\u5236\u88ab\u8a8d\u70ba\u662f A3C \u6210\u529f\u7684\u4e00\u500b\u95dc\u9375\u56e0\u7d20\u3002\u56e0\u70ba\u6bcf\u500b\u5de5\u4f5c\u8005\u90fd\u5728\u63a2\u7d22\u74b0\u5883\u7684\u4e0d\u540c\u90e8\u5206\uff0c\u5b83\u5011\u7522\u751f\u7684\u7d93\u9a57\u6578\u64da\u6d41\u5177\u6709\u5f88\u9ad8\u7684\u591a\u6a23\u6027\u3002\u5c07\u9019\u4e9b\u591a\u6a23\u5316\u7684\u68af\u5ea6\u6d41\u6301\u7e8c\u4e0d\u65b7\u5730\u3001\u7570\u6b65\u5730\u61c9\u7528\u65bc\u5168\u57df\u6a21\u578b\uff0c\u8d77\u5230\u4e86\u4e00\u7a2e\u985e\u4f3c\u65bc\u7d93\u9a57\u91cd\u64ad\u7684\u53bb\u76f8\u95dc\u4f5c\u7528\uff0c\u6253\u7834\u4e86\u6578\u64da\u7684\u6642\u9593\u76f8\u95dc\u6027\uff0c\u5f9e\u800c\u7a69\u5b9a\u5316\u4e86\u8a13\u7df4\u904e\u7a0b <sup><\/sup>\u3002A3C \u7684\u8a2d\u8a08\u4f7f\u5176\u80fd\u5920\u9ad8\u6548\u5229\u7528\u591a\u6838 
CPU\uff0c\u800c\u7121\u9700\u4f9d\u8cf4\u6602\u8cb4\u7684 GPU \u6216\u5927\u578b\u7d93\u9a57\u91cd\u64ad\u7de9\u885d\u5340 <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u7136\u800c\uff0c\u5f8c\u7e8c\u7684\u7814\u7a76\u767c\u73fe\uff0cA3C \u6210\u529f\u7684\u95dc\u9375\u53ef\u80fd\u4e26\u975e\u300c\u7570\u6b65\u66f4\u65b0\u300d\u672c\u8eab\uff0c\u800c\u662f\u300c\u5e73\u884c\u6578\u64da\u6536\u96c6\u300d\u6240\u5e36\u4f86\u7684\u6578\u64da\u591a\u6a23\u6027\u3002\u9019\u50ac\u751f\u4e86<strong>\u512a\u52e2\u884c\u52d5\u8005-\u8a55\u8ad6\u5bb6\uff08Advantage Actor-Critic, A2C\uff09<\/strong>\u7684\u51fa\u73fe\uff0c\u5b83\u672c\u8cea\u4e0a\u662f A3C \u7684\u4e00\u500b<strong>\u540c\u6b65<\/strong>\uff08synchronous\uff09\u3001\u78ba\u5b9a\u6027\u7248\u672c <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u5728 A2C \u4e2d\uff0c\u540c\u6a23\u5b58\u5728\u591a\u500b\u5e73\u884c\u5de5\u4f5c\u7684\u667a\u80fd\u9ad4\u3002\u4f46\u8207 A3C \u4e0d\u540c\u7684\u662f\uff0cA2C \u5f15\u5165\u4e86\u4e00\u500b\u4e2d\u592e\u5354\u8abf\u5668\u3002\u5354\u8abf\u5668\u6703\u7b49\u5f85\u6240\u6709\u7684\u5de5\u4f5c\u8005\u90fd\u5b8c\u6210\u4e86\u4e00\u5b9a\u6578\u91cf\u7684\u74b0\u5883\u4e92\u52d5\u6b65\u9a5f\u5f8c\uff0c\u5c07\u5b83\u5011\u6536\u96c6\u5230\u7684\u6240\u6709\u7d93\u9a57\u6216\u8a08\u7b97\u51fa\u7684\u6240\u6709\u68af\u5ea6\u805a\u5408\u8d77\u4f86 <sup><\/sup>\u3002\u7136\u5f8c\uff0c\u5354\u8abf\u5668\u8a08\u7b97\u4e00\u500b\u5e73\u5747\u68af\u5ea6\uff0c\u4e26\u7528\u9019\u500b\u68af\u5ea6\u5c0d\u5168\u57df\u7db2\u8def\u9032\u884c\u4e00\u6b21\u6027\u7684\u3001\u5927\u6279\u91cf\u7684\u66f4\u65b0 <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u9019\u7a2e\u540c\u6b65\u66f4\u65b0\u7684\u65b9\u5f0f\u4f7f\u5f97 A2C \u80fd\u5920\u66f4\u6709\u6548\u5730\u5229\u7528 GPU \u7684\u5e73\u884c\u8a08\u7b97\u80fd\u529b\uff0c\u56e0\u70ba GPU \u5728\u8655\u7406\u5927\u6279\u91cf\u6578\u64da\u6642\u6548\u7387\u6700\u9ad8 <sup><\/sup>\u3002\u5be6\u8e10\u8b49\u660e\uff0cA2C \u4e0d\u50c5\u5be6\u73fe\u8d77\u4f86\u6bd4 A3C 
\u66f4\u7c21\u55ae\uff0c\u800c\u4e14\u5728\u6027\u80fd\u4e0a\u4e5f\u5e38\u5e38\u80fd\u8207 A3C \u5339\u6575\u751a\u81f3\u8d85\u8d8a\u5f8c\u8005\u3002\u9019\u4e00\u7d50\u679c\u8868\u660e\uff0cA3C \u7684\u6027\u80fd\u63d0\u5347\u4e3b\u8981\u6b78\u529f\u65bc\u4f7f\u7528\u5e73\u884c\u74b0\u5883\u4f86\u7a69\u5b9a\u5b78\u7fd2\u904e\u7a0b\uff0c\u800c\u4e0d\u662f\u7570\u6b65\u66f4\u65b0\u6240\u5e36\u4f86\u7684\u566a\u8072\u6b63\u5247\u5316\u6548\u61c9 <sup><\/sup>\u3002A2C \u7684\u51fa\u73fe\uff0c\u70ba\u5f8c\u4f86\u7684 PPO \u7b49\u66f4\u5148\u9032\u7684\u6f14\u7b97\u6cd5\u5960\u5b9a\u4e86\u57fa\u790e\uff0c\u78ba\u7acb\u4e86\u57fa\u65bc\u5e73\u884c\u74b0\u5883\u6536\u96c6\u6578\u64da\u3001\u9032\u884c\u5927\u6279\u91cf\u540c\u6b65\u66f4\u65b0\u7684\u8a13\u7df4\u7bc4\u5f0f\u3002 &nbsp;<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">2.4 \u8fd1\u7aef\u7b56\u7565\u512a\u5316 (PPO)\uff1a\u73fe\u4ee3\u7684\u9ec3\u91d1\u6a19\u6e96<\/h3>\n\n\n\n<p>\u5118\u7ba1 A2C\/A3C \u7b49\u884c\u52d5\u8005-\u8a55\u8ad6\u5bb6\u65b9\u6cd5\u5728\u7a69\u5b9a\u6027\u4e0a\u53d6\u5f97\u4e86\u986f\u8457\u9032\u6b65\uff0c\u4f46\u6a19\u6e96\u7684\u7b56\u7565\u68af\u5ea6\u66f4\u65b0\u4ecd\u7136\u5b58\u5728\u4e00\u500b\u6839\u672c\u6027\u7684\u98a8\u96aa\uff1a<strong>\u66f4\u65b0\u6b65\u9577\uff08step size\uff09\u7684\u9078\u64c7\u6975\u5176\u654f\u611f<\/strong>\u3002\u5982\u679c\u5b78\u7fd2\u7387\u8a2d\u7f6e\u5f97\u904e\u5927\uff0c\u4e00\u6b21\u7cdf\u7cd5\u7684\u66f4\u65b0\u5c31\u53ef\u80fd\u5c0e\u81f4\u7b56\u7565\u6027\u80fd\u7684\u707d\u96e3\u6027\u5d29\u6f70\uff0c\u9019\u7a2e\u300c\u7834\u58de\u6027\u7684\u5927\u5e45\u5ea6\u7b56\u7565\u66f4\u65b0\u300d\u6703\u8b93\u667a\u80fd\u9ad4\u4e4b\u524d\u7684\u5b78\u7fd2\u6210\u679c\u6bc0\u65bc\u4e00\u65e6\uff0c\u4e14\u96e3\u4ee5\u6062\u5fa9 <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u70ba\u4e86\u89e3\u6c7a\u9019\u500b\u554f\u984c\uff0c\u7814\u7a76\u8005\u5011\u63d0\u51fa\u4e86\u4fe1\u4efb\u5340\u57df\u7b56\u7565\u512a\u5316\uff08Trust Region Policy Optimization, 
TRPO\uff09\u3002TRPO \u900f\u904e\u5728\u6bcf\u6b21\u66f4\u65b0\u6642\u65bd\u52a0\u4e00\u500b\u7d04\u675f\uff0c\u78ba\u4fdd\u65b0\u7b56\u7565\u8207\u820a\u7b56\u7565\u4e4b\u9593\u7684\u5dee\u7570\uff08\u901a\u5e38\u7528 KL \u6563\u5ea6\u8861\u91cf\uff09\u4e0d\u8d85\u904e\u4e00\u500b\u5c0f\u7684\u300c\u4fe1\u4efb\u5340\u57df\u300d\uff0c\u5f9e\u800c\u4fdd\u8b49\u4e86\u7b56\u7565\u7684\u55ae\u8abf\u6539\u9032\u3002\u7136\u800c\uff0cTRPO \u7684\u7d04\u675f\u662f\u4e00\u500b\u4e8c\u968e\u512a\u5316\u554f\u984c\uff0c\u8a08\u7b97\u8907\u96dc\u4e14\u96e3\u4ee5\u8207\u73fe\u4ee3\u6df1\u5ea6\u5b78\u7fd2\u6846\u67b6\u4e2d\u5e38\u7528\u7684\u96a8\u6a5f\u68af\u5ea6\u4e0b\u964d\uff08SGD\uff09\u53ca\u5176\u8b8a\u9ad4\uff08\u5982 Adam\uff09\u517c\u5bb9 <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p><strong>\u8fd1\u7aef\u7b56\u7565\u512a\u5316\uff08Proximal Policy Optimization, PPO\uff09<\/strong>\u61c9\u904b\u800c\u751f\uff0c\u5176\u76ee\u6a19\u662f\u5728\u4e0d\u72a7\u7272 TRPO \u7a69\u5b9a\u6027\u7684\u524d\u63d0\u4e0b\uff0c\u63d0\u4f9b\u4e00\u7a2e\u5be6\u73fe\u66f4\u7c21\u55ae\u3001\u8a08\u7b97\u6548\u7387\u66f4\u9ad8\u7684\u4e00\u968e\u512a\u5316\u6f14\u7b97\u6cd5 <sup><\/sup>\u3002PPO \u6191\u85c9\u5176\u51fa\u8272\u7684\u6027\u80fd\u548c\u6613\u7528\u6027\uff0c\u8fc5\u901f\u6210\u70ba\u4e86\u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\u9818\u57df\u7684\u300c\u9ec3\u91d1\u6a19\u6e96\u300d\u4e4b\u4e00\u3002 &nbsp;<\/p>\n\n\n\n<p>PPO \u7684\u6838\u5fc3\u5275\u65b0\u5728\u65bc\u5176\u7368\u7279\u7684<strong>\u88c1\u526a\u4ee3\u7406\u76ee\u6a19\u51fd\u6578\uff08Clipped Surrogate Objective Function\uff09<\/strong><sup><\/sup>\u3002\u9019\u500b\u76ee\u6a19\u51fd\u6578\u7684\u8a2d\u8a08\u5de7\u5999\u5730\u5728\u76ee\u6a19\u51fd\u6578\u5167\u90e8\u5be6\u73fe\u4e86\u985e\u4f3c\u65bc\u4fe1\u4efb\u5340\u57df\u7684\u7d04\u675f\u6548\u679c\u3002\u8b93\u6211\u5011\u8a73\u7d30\u5206\u89e3\u5176\u69cb\u6210\uff1a &nbsp;<\/p>\n\n\n\n<ol start=\"1\" 
class=\"wp-block-list\">\n<li><strong>\u6a5f\u7387\u6bd4\u7387\uff08Probability Ratio\uff09<\/strong>\uff1aPPO \u9996\u5148\u8a08\u7b97\u65b0\u7b56\u7565 <code>$\\pi_\\theta(a_t|s_t)$<\/code> \u8207\u7522\u751f\u6578\u64da\u7684\u820a\u7b56\u7565 <code>$\\pi_{\\theta_{old}}(a_t|s_t)$<\/code> \u4e4b\u9593\u5c0d\u540c\u4e00\u500b\u52d5\u4f5c\u7684\u6a5f\u7387\u6bd4\uff1a <code>$$r_t(\\theta) = \\frac{\\pi_\\theta(a_t|s_t)}{\\pi_{\\theta_{old}}(a_t|s_t)}$$<\/code> \u9019\u500b\u6bd4\u7387 <code>$r_t(\\theta)$<\/code> \u8861\u91cf\u4e86\u7b56\u7565\u66f4\u65b0\u7684\u5e45\u5ea6\u3002\u5982\u679c <code>$r_t > 1$<\/code>\uff0c\u8868\u793a\u65b0\u7b56\u7565\u66f4\u50be\u5411\u65bc\u63a1\u53d6\u8a72\u52d5\u4f5c\uff1b\u5982\u679c <code>$r_t &lt; 1$<\/code>\uff0c\u5247\u8868\u793a\u50be\u5411\u6027\u964d\u4f4e \u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u4ee3\u7406\u76ee\u6a19\uff08Surrogate Objective\uff09<\/strong>\uff1a\u6a19\u6e96\u7684\u7b56\u7565\u68af\u5ea6\u76ee\u6a19\u88ab\u66ff\u63db\u70ba <code>$L(\\theta) = r_t(\\theta) \\cdot \\hat{A}_t$<\/code>\uff0c\u5176\u4e2d <code>$\\hat{A}_t$<\/code> \u662f\u5728\u6642\u9593\u6b65 <code>t<\/code> \u7684\u512a\u52e2\u51fd\u6578\u4f30\u8a08\u3002<\/li>\n\n\n\n<li><strong>\u88c1\u526a\u6a5f\u5236\uff08Clipping Mechanism\uff09<\/strong>\uff1a\u9019\u662f PPO \u7684\u7cbe\u9ad3\u6240\u5728\u3002\u70ba\u4e86\u9632\u6b62 <code>$r_t(\\theta)$<\/code> \u8b8a\u5f97\u904e\u5927\u6216\u904e\u5c0f\uff08\u9019\u6703\u5c0e\u81f4\u5287\u70c8\u7684\u7b56\u7565\u66f4\u65b0\uff09\uff0cPPO \u5f15\u5165\u4e86\u4e00\u500b\u88c1\u526a\u64cd\u4f5c\u3002\u5b83\u5c07 <code>$r_t(\\theta)$<\/code> \u7684\u503c\u9650\u5236\u5728\u4e00\u500b\u5c0f\u7684\u5340\u9593 <code>$[1-\\epsilon, 1+\\epsilon]$<\/code> \u5167\uff0c\u5176\u4e2d <code>$\\epsilon$<\/code> \u662f\u4e00\u500b\u8d85\u53c3\u6578\uff08\u901a\u5e38\u53d6 0.1 \u6216 
0.2\uff09\u3002\u9019\u7522\u751f\u4e86\u76ee\u6a19\u51fd\u6578\u7684\u7b2c\u4e8c\u500b\u90e8\u5206\uff1a<code>$\\text{clip}(r_t(\\theta), 1-\\epsilon, 1+\\epsilon) \\cdot \\hat{A}_t$<\/code> \u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u6700\u7d42\u76ee\u6a19\u51fd\u6578<\/strong>\uff1aPPO \u6700\u7d42\u7684\u76ee\u6a19\u51fd\u6578\u662f\u53d6\u4e0a\u8ff0\u5169\u500b\u90e8\u5206\u4e2d\u7684<strong>\u6700\u5c0f\u503c<\/strong>\uff1a <code>$$L^{CLIP}(\\theta) = \\mathbb{E}_t \\left[ \\min\\left( r_t(\\theta)\\hat{A}_t, \\text{clip}(r_t(\\theta), 1-\\epsilon, 1+\\epsilon)\\hat{A}_t \\right) \\right]$$<\/code> \u9019\u500b <code>$\\min$<\/code> \u64cd\u4f5c\u662f\u95dc\u9375\u3002\u7576\u512a\u52e2 <code>$\\hat{A}_t$<\/code> \u70ba\u6b63\u6642\uff08\u5373\u52d5\u4f5c\u512a\u65bc\u5e73\u5747\uff09\uff0c\u76ee\u6a19\u51fd\u6578\u8b8a\u70ba <code>$\\min(r_t(\\theta)\\hat{A}_t, (1+\\epsilon)\\hat{A}_t)$<\/code>\u3002\u9019\u610f\u5473\u8457\u5373\u4f7f <code>$r_t(\\theta)$<\/code> \u8b8a\u5f97\u5f88\u5927\uff0c\u7b56\u7565\u66f4\u65b0\u7684\u5e45\u5ea6\u4e5f\u6703\u88ab <code>$(1+\\epsilon)$<\/code> \u6240\u9650\u5236\uff0c\u5f9e\u800c\u963b\u6b62\u4e86\u904e\u65bc\u6fc0\u9032\u7684\u66f4\u65b0\u3002\u7576\u512a\u52e2\u70ba\u8ca0\u6642\uff0c\u540c\u7406\u4e5f\u6703\u9650\u5236\u66f4\u65b0\u7684\u5e45\u5ea6\u3002<\/li>\n<\/ol>\n\n\n\n<p>\u9019\u7a2e\u8a2d\u8a08\u5275\u5efa\u4e86\u4e00\u500b\u5c0d\u7b56\u7565\u6539\u9032\u7684\u300c\u60b2\u89c0\u4e0b\u754c\u300d\uff08pessimistic lower bound\uff09\uff0c\u6709\u6548\u5730\u61f2\u7f70\u4e86\u90a3\u4e9b\u8a66\u5716\u504f\u96e2\u820a\u7b56\u7565\u592a\u9060\u7684\u66f4\u65b0\uff0c\u78ba\u4fdd\u4e86\u8a13\u7df4\u7684\u7a69\u5b9a\u6027 <sup><\/sup>\u3002PPO 
\u9019\u7a2e\u7c21\u55ae\u800c\u512a\u96c5\u7684\u88c1\u526a\u6a5f\u5236\uff0c\u4f7f\u5176\u80fd\u5920\u5728\u540c\u4e00\u6279\u6578\u64da\u4e0a\u5b89\u5168\u5730\u9032\u884c\u591a\u8f2a\uff08epochs\uff09\u7684\u5c0f\u6279\u91cf\uff08minibatch\uff09\u66f4\u65b0\uff0c\u9019\u6975\u5927\u5730\u63d0\u9ad8\u4e86\u6a23\u672c\u6548\u7387\uff08sample efficiency\uff09\uff0c\u4f7f\u5176\u5728\u6027\u80fd\u3001\u5be6\u73fe\u7c21\u55ae\u6027\u548c\u7a69\u5b9a\u6027\u4e4b\u9593\u9054\u5230\u4e86\u7d55\u4f73\u7684\u5e73\u8861 <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u5f9e DQN \u5230 PPO \u7684\u6f14\u5316\uff0c\u53cd\u6620\u4e86 DRL \u9818\u57df\u5728\u99b4\u670d\u4e0d\u7a69\u5b9a\u6027\u65b9\u9762\u7684\u6301\u7e8c\u9032\u6b65\uff1a\u9996\u5148\u662f\u7a69\u5b9a\u50f9\u503c\u51fd\u6578\u672c\u8eab\uff08DQN\uff09\uff0c\u7136\u5f8c\u662f\u7a69\u5b9a\u9ad8\u8b8a\u7570\u6578\u7684\u7b56\u7565\u68af\u5ea6\uff08AC\uff09\uff0c\u6700\u7d42\u662f\u7a69\u5b9a\u7b56\u7565\u66f4\u65b0\u7684\u6b65\u9577\uff08PPO\uff09\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u7b2c\u4e09\u90e8\u5206\uff1a\u5b78\u7fd2\u5167\u90e8\u4e16\u754c &#8211; 
\u6a21\u578b\u57fa\u790e\u7684\u524d\u6cbf<\/h2>\n\n\n\n<p>\u524d\u9762\u63a2\u8a0e\u7684\u50f9\u503c\u57fa\u790e\u548c\u7b56\u7565\u57fa\u790e\u65b9\u6cd5\u90fd\u5c6c\u65bc\u300c\u7121\u6a21\u578b\u300d\uff08model-free\uff09\u5f37\u5316\u5b78\u7fd2\u7684\u7bc4\u7587\uff0c\u5b83\u5011\u76f4\u63a5\u5f9e\u8207\u74b0\u5883\u7684\u4e92\u52d5\u7d93\u9a57\u4e2d\u5b78\u7fd2\u50f9\u503c\u51fd\u6578\u6216\u7b56\u7565\uff0c\u800c\u7121\u9700\u7406\u89e3\u74b0\u5883\u7684\u5167\u90e8\u904b\u4f5c\u6a5f\u5236\u3002\u672c\u90e8\u5206\u5c07\u63a2\u7d22\u5f37\u5316\u5b78\u7fd2\u6f14\u5316\u6a39\u7684\u53e6\u4e00\u500b\u91cd\u8981\u5206\u652f\uff1a\u300c\u6a21\u578b\u57fa\u790e\u300d\uff08model-based\uff09RL\u3002\u9019\u4e9b\u65b9\u6cd5\u4e0d\u76f4\u63a5\u5b78\u7fd2\u5982\u4f55\u884c\u52d5\uff0c\u800c\u662f\u9996\u5148\u5b78\u7fd2\u4e00\u500b\u74b0\u5883\u672c\u8eab\u7684\u52d5\u614b\u6a21\u578b\uff0c\u9019\u4e00\u7b56\u7565\u4e0a\u7684\u8f49\u8b8a\uff0c\u70ba\u89e3\u6c7a\u6a23\u672c\u6548\u7387\u554f\u984c\u5e36\u4f86\u4e86\u5de8\u5927\u7684\u6f5b\u529b\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">3.1 \u6a21\u578b\u57fa\u790e\u8207\u7121\u6a21\u578b\u7684\u4e8c\u5206\u6cd5<\/h3>\n\n\n\n<p>\u5728\u5f37\u5316\u5b78\u7fd2\u4e2d\uff0c\u6839\u64da\u667a\u80fd\u9ad4\u662f\u5426\u5b78\u7fd2\u74b0\u5883\u7684\u52d5\u614b\u6a21\u578b\uff0c\u53ef\u4ee5\u5c07\u6f14\u7b97\u6cd5\u5206\u70ba\u5169\u5927\u985e <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>\u7121\u6a21\u578b\u5f37\u5316\u5b78\u7fd2 (Model-Free RL)<\/strong>\uff1a\u5982\u524d\u8ff0\u7684 DQN \u548c 
PPO\uff0c\u9019\u985e\u6f14\u7b97\u6cd5\u7684\u667a\u80fd\u9ad4\u5c07\u74b0\u5883\u8996\u70ba\u4e00\u500b\u300c\u9ed1\u76d2\u5b50\u300d\u3002\u5b83\u4e0d\u8a66\u5716\u53bb\u7406\u89e3\u72c0\u614b\u662f\u5982\u4f55\u8f49\u63db\u7684\uff0c\u6216\u8005\u734e\u52f5\u662f\u5982\u4f55\u751f\u6210\u7684\u3002\u76f8\u53cd\uff0c\u5b83\u5b8c\u5168\u900f\u904e\u8a66\u932f\uff08trial-and-error\uff09\u7684\u65b9\u5f0f\uff0c\u76f4\u63a5\u5f9e\u7d93\u9a57\u5143\u7d44 <code>$(s, a, r, s')$<\/code> \u4e2d\u5b78\u7fd2\u4e00\u500b\u50f9\u503c\u51fd\u6578\u6216\u4e00\u500b\u7b56\u7565 \u3002\u5176\u5b78\u7fd2\u7684\u6838\u5fc3\u662f\u56de\u7b54\u300c\u5728\u72c0\u614b \u00a0 <code>s<\/code> \u505a\u4ec0\u9ebc\u52d5\u4f5c <code>a<\/code> \u597d\uff1f\u300d\u9019\u500b\u554f\u984c\u3002<\/li>\n\n\n\n<li><strong>\u6a21\u578b\u57fa\u790e\u5f37\u5316\u5b78\u7fd2 (Model-Based RL)<\/strong>\uff1a\u9019\u985e\u6f14\u7b97\u6cd5\u7684\u667a\u80fd\u9ad4\u5247\u63a1\u53d6\u4e00\u7a2e\u66f4\u9593\u63a5\u7684\u65b9\u5f0f\u3002\u5b83\u9996\u5148\u81f4\u529b\u65bc\u5b78\u7fd2\u4e00\u500b\u74b0\u5883\u7684<strong>\u6a21\u578b<\/strong>\uff0c\u9019\u500b\u6a21\u578b\u901a\u5e38\u7528\u4f86\u9810\u6e2c\u5728\u7d66\u5b9a\u7576\u524d\u72c0\u614b <code>s<\/code> \u548c\u52d5\u4f5c <code>a<\/code> \u7684\u60c5\u6cc1\u4e0b\uff0c\u4e0b\u4e00\u500b\u72c0\u614b <code>s'<\/code> \u548c\u734e\u52f5 <code>r<\/code> \u7684\u6a5f\u7387\u5206\u4f48\uff0c\u5373 <code>$p(s', r | s, a)$<\/code> 
\u3002\u4e00\u65e6\u5b78\u5230\u4e86\u9019\u500b\u6a21\u578b\uff0c\u667a\u80fd\u9ad4\u5c31\u53ef\u4ee5\u5229\u7528\u5b83\u4f86\u9032\u884c\u300c\u898f\u5283\u300d\uff08planning\uff09\u3002\u4f8b\u5982\uff0c\u5b83\u53ef\u4ee5\u5728\u5167\u90e8\u300c\u60f3\u50cf\u300d\u6216\u300c\u6a21\u64ec\u300d\u57f7\u884c\u4e00\u7cfb\u5217\u52d5\u4f5c\u5f8c\u53ef\u80fd\u7522\u751f\u7684\u5f8c\u679c\uff0c\u800c\u7121\u9700\u8207\u771f\u5be6\u74b0\u5883\u9032\u884c\u4e92\u52d5\uff0c\u5f9e\u800c\u627e\u5230\u4e00\u500b\u6700\u512a\u7684\u884c\u52d5\u8a08\u5283 \u3002\u5176\u5b78\u7fd2\u7684\u6838\u5fc3\u662f\u56de\u7b54\u300c\u5982\u679c\u6211\u5728\u72c0\u614b \u00a0 <code>s<\/code> \u505a\u52d5\u4f5c <code>a<\/code>\uff0c\u4e16\u754c\u6703\u8b8a\u6210\u4ec0\u9ebc\u6a23\uff1f\u300d\u9019\u500b\u554f\u984c\u3002<\/li>\n<\/ul>\n\n\n\n<p>\u9019\u5169\u7a2e\u7bc4\u5f0f\u4e4b\u9593\u5b58\u5728\u4e00\u500b\u6838\u5fc3\u7684\u6b0a\u8861\uff0c\u4e3b\u8981\u9ad4\u73fe\u5728<strong>\u6a23\u672c\u6548\u7387<\/strong>\u548c<strong>\u6f38\u9032\u6027\u80fd<\/strong>\u4e0a\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>\u6a23\u672c\u6548\u7387 (Sample Efficiency)<\/strong>\uff1a\u6a21\u578b\u57fa\u790e\u65b9\u6cd5\u901a\u5e38\u5177\u6709\u986f\u8457\u66f4\u9ad8\u7684\u6a23\u672c\u6548\u7387 \u3002\u539f\u56e0\u5728\u65bc\uff0c\u4e00\u65e6\u5b78\u6703\u4e86\u74b0\u5883\u6a21\u578b\uff0c\u667a\u80fd\u9ad4\u5c31\u53ef\u4ee5\u5229\u7528\u9019\u500b\u6a21\u578b\u751f\u6210\u5927\u91cf\u7684\u6a21\u64ec\u6578\u64da\u6216\u300c\u60f3\u50cf\u7684\u7d93\u9a57\u300d\uff08imagined experience\uff09\uff0c\u5f9e\u800c\u6975\u5927\u5730\u6e1b\u5c11\u4e86\u8207\u771f\u5be6\u4e16\u754c\u4e92\u52d5\u7684\u9700\u6c42 
\u3002\u9019\u5728\u90a3\u4e9b\u771f\u5be6\u4e16\u754c\u6a23\u672c\u6536\u96c6\u6210\u672c\u9ad8\u6602\uff08\u5982\u91d1\u878d\u4ea4\u6613\uff09\u3001\u8017\u6642\u9577\uff08\u5982\u85e5\u7269\u7814\u767c\uff09\u6216\u5b58\u5728\u5371\u96aa\uff08\u5982\u6a5f\u5668\u4eba\u63a7\u5236\u3001\u81ea\u52d5\u99d5\u99db\uff09\u7684\u9818\u57df\u4e2d\u81f3\u95dc\u91cd\u8981 \u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u6f38\u9032\u6027\u80fd (Asymptotic Performance) \u8207\u6a21\u578b\u504f\u5dee (Model Bias)<\/strong>\uff1a\u5118\u7ba1\u6a23\u672c\u6548\u7387\u9ad8\uff0c\u4f46\u6a21\u578b\u57fa\u790e\u65b9\u6cd5\u7684\u6700\u7d42\u6027\u80fd\u4e0a\u9650\u537b\u53d7\u5236\u65bc\u5176\u6240\u5b78\u6a21\u578b\u7684\u6e96\u78ba\u6027\u3002\u5982\u679c\u5b78\u7fd2\u5230\u7684\u6a21\u578b\u8207\u771f\u5be6\u74b0\u5883\u5b58\u5728\u504f\u5dee\uff08\u5373\u300c\u6a21\u578b\u504f\u5dee\u300d\uff09\uff0c\u90a3\u9ebc\u57fa\u65bc\u9019\u500b\u6709\u7f3a\u9677\u7684\u6a21\u578b\u898f\u5283\u51fa\u7684\u7b56\u7565\u4e5f\u5c07\u662f\u6b21\u512a\u7684 \u3002\u76f8\u6bd4\u4e4b\u4e0b\uff0c\u7121\u6a21\u578b\u65b9\u6cd5\u76f4\u63a5\u5f9e\u771f\u5be6\u74b0\u5883\u4e2d\u5b78\u7fd2\uff0c\u4e0d\u53d7\u6a21\u578b\u504f\u5dee\u7684\u9650\u5236\uff0c\u56e0\u6b64\u5728\u6709\u8db3\u5920\u591a\u7684\u6578\u64da\u9032\u884c\u8a13\u7df4\u6642\uff0c\u5b83\u5011\u5f80\u5f80\u80fd\u9054\u5230\u66f4\u9ad8\u7684\u6700\u7d42\uff08\u6f38\u9032\uff09\u6027\u80fd \u3002 \u00a0<\/li>\n<\/ul>\n\n\n\n<p>\u9019\u7a2e\u4e8c\u5206\u6cd5\u4ee3\u8868\u4e86\u5f37\u5316\u5b78\u7fd2\u4e2d\u5169\u7a2e\u622a\u7136\u4e0d\u540c\u7684\u5b78\u7fd2\u54f2\u5b78\uff1a\u662f\u76f4\u63a5\u5b78\u7fd2\u300c\u600e\u9ebc\u505a\u300d\uff08\u7121\u6a21\u578b\uff09\uff0c\u9084\u662f\u5148\u5b78\u7fd2\u300c\u4e16\u754c\u5982\u4f55\u904b\u4f5c\u300d\u7136\u5f8c\u518d\u6c7a\u5b9a\u300c\u600e\u9ebc\u505a\u300d\uff08\u6a21\u578b\u57fa\u790e\uff09\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">3.2 
\u4e16\u754c\u6a21\u578b\uff1a\u5728\u6f5b\u5728\u7a7a\u9593\u4e2d\u5b78\u7fd2\u505a\u5922<\/h3>\n\n\n\n<p>\u6a21\u578b\u57fa\u790e\u5f37\u5316\u5b78\u7fd2\u7684\u7406\u5ff5\u96d6\u7136\u8a98\u4eba\uff0c\u4f46\u9577\u671f\u4ee5\u4f86\u9762\u81e8\u4e00\u500b\u5de8\u5927\u6311\u6230\uff1a\u70ba\u8907\u96dc\u7684\u9ad8\u7dad\u74b0\u5883\uff08\u5982\u5f9e\u50cf\u7d20\u8f38\u5165\u7684\u904a\u6232\u4e16\u754c\uff09\u5b78\u7fd2\u4e00\u500b\u6e96\u78ba\u7684\u52d5\u614b\u6a21\u578b\u6975\u5176\u56f0\u96e3\u3002\u76f4\u63a5\u9810\u6e2c\u4e0b\u4e00\u5e40\u5716\u50cf\u4e2d\u7684\u6bcf\u4e00\u500b\u50cf\u7d20\u4e0d\u50c5\u8a08\u7b97\u91cf\u5de8\u5927\uff0c\u800c\u4e14\u5fae\u5c0f\u7684\u9810\u6e2c\u8aa4\u5dee\u6703\u96a8\u8457\u6642\u9593\u8fc5\u901f\u7d2f\u7a4d\uff0c\u5c0e\u81f4\u6a21\u64ec\u51fa\u7684\u8ecc\u8de1\u5f88\u5feb\u8207\u73fe\u5be6\u812b\u7bc0\u3002<\/p>\n\n\n\n<p>2018 \u5e74\uff0cDavid Ha \u548c J\u00fcrgen Schmidhuber \u63d0\u51fa\u7684\u300c\u4e16\u754c\u6a21\u578b\u300d\uff08World Models\uff09\u8ad6\u6587\u70ba\u9019\u4e00\u96e3\u984c\u63d0\u4f9b\u4e86\u4e00\u500b\u9769\u547d\u6027\u7684\u89e3\u6c7a\u65b9\u6848 <sup><\/sup>\u3002\u5176\u6838\u5fc3\u601d\u60f3\u6df1\u53d7\u4eba\u985e\u8a8d\u77e5\u7cfb\u7d71\u7684\u555f\u767c\uff1a\u6211\u5011\u4e26\u4e0d\u5728\u5927\u8166\u4e2d\u5c0d\u4e16\u754c\u7684\u6bcf\u4e00\u500b\u7d30\u7bc0\u9032\u884c\u6a21\u64ec\uff0c\u800c\u662f\u5efa\u7acb\u4e00\u500b\u62bd\u8c61\u7684\u3001\u58d3\u7e2e\u7684\u5167\u90e8\u5fc3\u667a\u6a21\u578b\uff0c\u4e26\u57fa\u65bc\u9019\u500b\u6a21\u578b\u9032\u884c\u9810\u6e2c\u548c\u6c7a\u7b56 <sup><\/sup>\u3002\u4e16\u754c\u6a21\u578b\u7684\u76ee\u6a19\u6b63\u662f\u5c07\u8907\u96dc\u7684<strong>\u611f\u77e5\/\u5efa\u6a21\u554f\u984c<\/strong>\u8207\u76f8\u5c0d\u7c21\u55ae\u7684<strong>\u63a7\u5236\u554f\u984c<\/strong>\u5206\u96e2\u958b\u4f86 <sup><\/sup>\u3002 
&nbsp;<\/p>\n\n\n\n<p>\u4e16\u754c\u6a21\u578b\u67b6\u69cb\u7531\u4e09\u500b\u6838\u5fc3\u7d44\u4ef6\u69cb\u6210\uff0c\u5206\u5225\u662f V\u3001M \u548c C <sup><\/sup>\uff1a &nbsp;<\/p>\n\n\n\n<ol start=\"1\" class=\"wp-block-list\">\n<li><strong>\u8996\u89ba\u6a21\u578b (Vision Model, V)<\/strong>\uff1a\u9019\u662f\u4e00\u500b\u8b8a\u5206\u81ea\u52d5\u7de8\u78bc\u5668\uff08Variational Autoencoder, VAE\uff09\u3002\u5b83\u7684\u4efb\u52d9\u662f\u5728\u7121\u76e3\u7763\u7684\u60c5\u6cc1\u4e0b\uff0c\u5c07\u5f9e\u74b0\u5883\u4e2d\u89c0\u6e2c\u5230\u7684\u9ad8\u7dad\u539f\u59cb\u8f38\u5165\uff08\u5982 <code>64x64x3<\/code> \u7684\u50cf\u7d20\u5716\u50cf\uff09\u58d3\u7e2e\u6210\u4e00\u500b\u4f4e\u7dad\u7684<strong>\u6f5b\u5728\u5411\u91cf\uff08latent vector\uff09<code>$z$<\/code><\/strong>\u3002\u9019\u500b\u6f5b\u5728\u5411\u91cf <code>$z$<\/code> \u6355\u6349\u4e86\u89c0\u6e2c\u5716\u50cf\u7684\u7a7a\u9593\u7cbe\u83ef\uff0c\u5f62\u6210\u4e86\u4e00\u500b\u95dc\u65bc\u4e16\u754c\u7684\u58d3\u7e2e\u7a7a\u9593\u8868\u5fb5\u3002\u4f8b\u5982\uff0c\u5b83\u53ef\u4ee5\u5c07\u8907\u96dc\u7684\u8cfd\u9053\u756b\u9762\u58d3\u7e2e\u6210\u4e00\u500b\u50c5 32 \u7dad\u7684\u5411\u91cf \u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u8a18\u61b6\u6a21\u578b (Memory Model, M)<\/strong>\uff1a\u9019\u662f\u4e00\u500b\u5faa\u74b0\u795e\u7d93\u7db2\u8def\uff08Recurrent Neural Network, RNN\uff09\uff0c\u901a\u5e38\u8207\u6df7\u5408\u5bc6\u5ea6\u7db2\u8def\uff08Mixture Density Network, MDN\uff09\u7d50\u5408\u4f7f\u7528\u3002V \u6a21\u578b\u8ca0\u8cac\u58d3\u7e2e\u7a7a\u9593\u8cc7\u8a0a\uff0c\u800c M \u6a21\u578b\u5247\u8ca0\u8cac\u5b78\u7fd2\u548c\u58d3\u7e2e<strong>\u6642\u9593\u52d5\u614b<\/strong>\u3002\u5b83\u5728\u4f4e\u7dad\u7684\u6f5b\u5728\u7a7a\u9593\u4e2d\u904b\u4f5c\uff0c\u5176\u76ee\u6a19\u662f\u5b78\u7fd2\u9810\u6e2c<strong>\u4e0b\u4e00\u500b<\/strong>\u6f5b\u5728\u72c0\u614b <code>$z_{t+1}$<\/code> \u7684\u6a5f\u7387\u5206\u4f48\uff0c\u5373 <code>$p(z_{t+1} | z_t, a_t, 
h_t)$<\/code>\uff0c\u5176\u4e2d <code>$a_t$<\/code> \u662f\u7576\u524d\u52d5\u4f5c\uff0c<code>$h_t$<\/code> \u662f RNN \u7684\u96b1\u85cf\u72c0\u614b \u3002\u7531\u65bc\u74b0\u5883\u53ef\u80fd\u662f\u96a8\u6a5f\u7684\uff0cM \u6a21\u578b\u8f38\u51fa\u7684\u662f\u4e00\u500b\u6a5f\u7387\u5206\u4f48\uff08\u901a\u5e38\u662f\u9ad8\u65af\u6df7\u5408\u6a21\u578b\uff09\uff0c\u800c\u4e0d\u662f\u4e00\u500b\u78ba\u5b9a\u7684\u9810\u6e2c\u3002\u9019\u500b M \u6a21\u578b\u69cb\u6210\u4e86\u4e16\u754c\u6a21\u578b\u7684\u9810\u6e2c\u6838\u5fc3\u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u63a7\u5236\u5668 (Controller, C)<\/strong>\uff1a\u9019\u662f\u4e00\u500b\u6975\u5176\u5c0f\u5de7\u548c\u7c21\u55ae\u7684\u7b56\u7565\u7db2\u8def\uff0c\u4f8b\u5982\uff0c\u53ef\u4ee5\u53ea\u662f\u4e00\u500b\u55ae\u5c64\u7684\u7dda\u6027\u6a21\u578b\u3002\u5b83\u7684\u8f38\u5165\u662f\u7576\u524d\u7684\u6f5b\u5728\u72c0\u614b <code>$z_t$<\/code> \u548c\u8a18\u61b6\u6a21\u578b\u7684\u96b1\u85cf\u72c0\u614b <code>$h_t$<\/code>\uff0c\u8f38\u51fa\u5247\u662f\u52d5\u4f5c <code>$a_t$<\/code>\u3002\u63a7\u5236\u5668\u7684\u6975\u7c21\u8a2d\u8a08\u662f\u4e16\u754c\u6a21\u578b\u7406\u5ff5\u7684\u95dc\u9375\uff1a\u5c07\u667a\u80fd\u9ad4\u7684\u7d55\u5927\u90e8\u5206\u8907\u96dc\u6027\uff08\u6578\u767e\u842c\u751a\u81f3\u66f4\u591a\u7684\u53c3\u6578\uff09\u90fd\u653e\u5728\u4e86 V \u548c M \u6a21\u578b\u4e2d\uff0c\u800c\u63a7\u5236\u554f\u984c\u5247\u5728\u4e00\u500b\u53c3\u6578\u6975\u5c11\uff08\u53ef\u80fd\u53ea\u6709\u5e7e\u767e\u500b\uff09\u7684\u7a7a\u9593\u4e2d\u89e3\u6c7a \u3002 
\u00a0<\/li>\n<\/ol>\n\n\n\n<p>\u4e16\u754c\u6a21\u578b\u7684\u7cbe\u5999\u4e4b\u8655\u5728\u65bc\uff0c\u5b83\u6c92\u6709\u8a66\u5716\u5728\u50cf\u7d20\u7d1a\u5225\u4e0a\u9810\u6e2c\u672a\u4f86\uff0c\u800c\u662f\u9996\u5148\u900f\u904e\u7121\u76e3\u7763\u5b78\u7fd2\uff08VAE\uff09\u5275\u5efa\u4e86\u4e00\u500b\u66f4\u7c21\u55ae\u3001\u66f4\u62bd\u8c61\u7684\u300c\u904a\u6a02\u5834\u300d\uff08\u5373\u6f5b\u5728\u7a7a\u9593\uff09\uff0c\u7136\u5f8c\u5728\u9019\u500b\u904a\u6a02\u5834\u88e1\u5b78\u7fd2\u52d5\u614b\u6a21\u578b\uff08MDN-RNN\uff09\u3002\u9019\u4f7f\u5f97\u5b78\u7fd2\u4e00\u500b\u6709\u7528\u7684\u9810\u6e2c\u6a21\u578b\u8b8a\u5f97\u66f4\u52a0\u53ef\u884c\u548c\u9ad8\u6548\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">3.3 \u5728\u5e7b\u89ba\u73fe\u5be6\u4e2d\u8a13\u7df4\uff1a\u60f3\u50cf\u7684\u529b\u91cf<\/h3>\n\n\n\n<p>\u4e16\u754c\u6a21\u578b\u6700\u5f15\u4eba\u6ce8\u76ee\u7684\u7279\u9ede\u4e4b\u4e00\u662f\u5176\u65b0\u7a4e\u7684\u5169\u968e\u6bb5\u8a13\u7df4\u6d41\u7a0b\uff0c\u7279\u5225\u662f\u63a7\u5236\u5668\uff08C\uff09\u7684\u8a13\u7df4\u65b9\u5f0f\uff0c\u5373\u5b8c\u5168\u5728\u5176\u7531 M \u6a21\u578b\u751f\u6210\u7684\u300c\u5922\u5883\u300d\u6216\u300c\u5e7b\u89ba\u73fe\u5be6\u300d\u4e2d\u9032\u884c <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p><strong>\u8a13\u7df4\u6d41\u7a0b\u8a73\u89e3<\/strong><\/p>\n\n\n\n<ol start=\"1\" class=\"wp-block-list\">\n<li><strong>\u7b2c\u4e00\u968e\u6bb5\uff1a\u5b78\u7fd2\u4e16\u754c\u6a21\u578b\uff08\u7121\u76e3\u7763\uff09<\/strong> \u9996\u5148\uff0c\u667a\u80fd\u9ad4\u900f\u904e\u5728\u771f\u5be6\u74b0\u5883\u4e2d\u57f7\u884c\u96a8\u6a5f\u7b56\u7565\u4f86\u6536\u96c6\u5927\u91cf\u7684\u539f\u59cb\u89c0\u6e2c\u6578\u64da\uff08\u4f8b\u5982\uff0c\u6578\u5343\u6b21\u904a\u6232\u7684\u756b\u9762\u5e8f\u5217\uff09\u3002\u7136\u5f8c\uff0c\u5229\u7528\u9019\u4e9b\u6578\u64da\uff0c\u5728\u5b8c\u5168\u7121\u76e3\u7763\u7684\u65b9\u5f0f\u4e0b\u8a13\u7df4 V \u6a21\u578b\u548c M \u6a21\u578b\u3002V 
\u6a21\u578b\uff08VAE\uff09\u5b78\u7fd2\u5982\u4f55\u5c07\u6bcf\u4e00\u5e40\u5716\u50cf\u58d3\u7e2e\u6210\u6f5b\u5728\u5411\u91cf <code>$z$<\/code>\uff0c\u800c M \u6a21\u578b\uff08MDN-RNN\uff09\u5247\u5b78\u7fd2\u5728\u7d66\u5b9a\u7576\u524d\u6f5b\u5728\u72c0\u614b <code>$z_t$<\/code> \u548c\u52d5\u4f5c <code>$a_t$<\/code> \u7684\u60c5\u6cc1\u4e0b\uff0c\u5982\u4f55\u9810\u6e2c\u4e0b\u4e00\u500b\u6f5b\u5728\u72c0\u614b <code>$z_{t+1}$<\/code> \u7684\u6a5f\u7387\u5206\u4f48 \u3002\u9019\u500b\u968e\u6bb5\u7d50\u675f\u5f8c\uff0c\u6211\u5011\u5c31\u5f97\u5230\u4e86\u4e00\u500b\u53ef\u4ee5\u6a21\u64ec\u6f5b\u5728\u7a7a\u9593\u52d5\u614b\u7684\u300c\u4e16\u754c\u6a21\u578b\u300d\u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u7b2c\u4e8c\u968e\u6bb5\uff1a\u5728\u300c\u5922\u5883\u300d\u4e2d\u8a13\u7df4\u63a7\u5236\u5668<\/strong> \u4e00\u65e6\u4e16\u754c\u6a21\u578b\u8a13\u7df4\u5b8c\u6210\uff0c\u63a7\u5236\u5668\u7684\u8a13\u7df4\u5c31\u53ef\u4ee5<strong>\u5b8c\u5168\u812b\u96e2\u771f\u5be6\u74b0\u5883<\/strong>\u9032\u884c\u3002\u6574\u500b\u904e\u7a0b\u767c\u751f\u5728\u7531 M \u6a21\u578b\u5275\u9020\u7684\u865b\u64ec\u74b0\u5883\u4e2d \u3002\n<ul class=\"wp-block-list\">\n<li><strong>\u5922\u5883\u7684\u904b\u4f5c\u65b9\u5f0f<\/strong>\uff1aM \u6a21\u578b\u6b64\u6642\u626e\u6f14\u4e86\u4e00\u500b\u5b8c\u6574\u7684\u6a21\u64ec\u5668\u89d2\u8272\u3002\u63a7\u5236\u5668 C \u63a5\u6536\u4e00\u500b\u521d\u59cb\u7684\u6f5b\u5728\u72c0\u614b <code>$z_t$<\/code> \u548c\u96b1\u85cf\u72c0\u614b <code>$h_t$<\/code>\uff0c\u8f38\u51fa\u4e00\u52d5\u4f5c <code>$a_t$<\/code>\u3002\u9019\u500b\u52d5\u4f5c\u8207 <code>$z_t, h_t$<\/code> \u4e00\u540c\u88ab\u9001\u5165 M \u6a21\u578b\uff0cM \u6a21\u578b\u5247\u5f9e\u5176\u9810\u6e2c\u7684\u6a5f\u7387\u5206\u4f48\u4e2d\u63a1\u6a23\u51fa\u4e0b\u4e00\u500b\u6f5b\u5728\u72c0\u614b 
<code>$z_{t+1}$<\/code>\u3002\u9019\u500b\u904e\u7a0b\u53ef\u4ee5\u4e0d\u65b7\u91cd\u8907\uff0c\u5f9e\u800c\u5728\u6f5b\u5728\u7a7a\u9593\u4e2d\u751f\u6210\u4e00\u689d\u5b8c\u6574\u7684\u300c\u5e7b\u89ba\u300d\u8ecc\u8de1\u3002<\/li>\n\n\n\n<li><strong>\u512a\u5316\u63a7\u5236\u5668<\/strong>\uff1a\u5728\u9019\u500b\u865b\u64ec\u7684\u3001\u57fa\u65bc\u6f5b\u5728\u7a7a\u9593\u7684\u74b0\u5883\u4e2d\uff0c\u63a7\u5236\u5668 C \u7684\u76ee\u6a19\u662f\u6700\u5927\u5316\u7d2f\u7a4d\u734e\u52f5\u3002\u7531\u65bc\u63a7\u5236\u5668 C \u7684\u53c3\u6578\u975e\u5e38\u5c11\uff0c\u53ef\u4ee5\u4f7f\u7528\u5c0d\u68af\u5ea6\u4e0d\u654f\u611f\u7684\u6f14\u5316\u7b56\u7565\uff08Evolution Strategies\uff09\uff0c\u5982\u5354\u65b9\u5dee\u77e9\u9663\u81ea\u9069\u61c9\u6f14\u5316\u7b56\u7565\uff08CMA-ES\uff09\uff0c\u4f86\u9032\u884c\u9ad8\u6548\u7684\u512a\u5316\u3002CMA-ES \u53ea\u9700\u8981\u6bcf\u689d\u8ecc\u8de1\u7684\u6700\u7d42\u7e3d\u56de\u5831\uff0c\u800c\u4e0d\u9700\u8981\u68af\u5ea6\u4fe1\u606f\uff0c\u975e\u5e38\u9069\u5408\u9019\u7a2e\u9ed1\u7bb1\u512a\u5316\u554f\u984c \u3002 \u00a0<\/li>\n<\/ul>\n<\/li>\n<\/ol>\n\n\n\n<p><strong>\u512a\u52e2\u8207\u7b56\u7565\u8f49\u79fb<\/strong> \u9019\u7a2e\u300c\u5728\u5922\u4e2d\u8a13\u7df4\u300d\u7684\u6a21\u5f0f\u5e36\u4f86\u4e86\u5de8\u5927\u7684\u597d\u8655\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>\u6975\u9ad8\u7684\u8a08\u7b97\u6548\u7387<\/strong>\uff1a\u5728\u4f4e\u7dad\u7684\u6f5b\u5728\u7a7a\u9593\u4e2d\u9032\u884c\u6a21\u64ec\uff0c\u9060\u6bd4\u904b\u884c\u4e00\u500b\u5b8c\u6574\u7684\u7269\u7406\u5f15\u64ce\u6216\u5716\u5f62\u6e32\u67d3\u5668\u4f86\u751f\u6210\u771f\u5be6\u74b0\u5883\u7684\u4e0b\u4e00\u5e40\u8981\u5feb\u5f97\u591a\u3002\u9019\u4f7f\u5f97\u667a\u80fd\u9ad4\u53ef\u4ee5\u5728\u6975\u77ed\u7684\u6642\u9593\u5167\u300c\u60f3\u50cf\u300d\u4e26\u8a55\u4f30\u6578\u767e\u842c\u7a2e\u53ef\u80fd\u7684\u672a\u4f86\uff0c\u5f9e\u800c\u5be6\u73fe\u4e86\u6a21\u578b\u57fa\u790e RL 
\u7684\u6a23\u672c\u6548\u7387\u512a\u52e2\uff0c\u540c\u6642\u907f\u514d\u4e86\u50cf\u7d20\u7d1a\u9810\u6e2c\u7684\u56f0\u96e3 \u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u7b56\u7565\u7684\u76f4\u63a5\u8f49\u79fb<\/strong>\uff1a\u7531\u65bc\u63a7\u5236\u5668 C \u5b78\u7fd2\u5230\u7684\u7b56\u7565\u662f\u57fa\u65bc\u7531 V \u548c M \u6a21\u578b\u751f\u6210\u7684\u58d3\u7e2e\u8868\u5fb5 <code>$z_t$<\/code> \u548c <code>$h_t$<\/code>\uff0c\u800c\u9019\u5169\u500b\u6a21\u578b\u5728\u771f\u5be6\u74b0\u5883\u548c\u5922\u5883\u74b0\u5883\u4e2d\u90fd\u4ee5\u540c\u6a23\u7684\u65b9\u5f0f\u904b\u4f5c\uff0c\u56e0\u6b64\uff0c\u5728\u5922\u5883\u4e2d\u8a13\u7df4\u597d\u7684\u63a7\u5236\u5668\u53ef\u4ee5\u88ab<strong>\u76f4\u63a5\u90e8\u7f72\u56de\u771f\u5be6\u74b0\u5883<\/strong>\u4e2d\uff0c\u8207 V \u548c M \u6a21\u578b\u7d50\u5408\uff0c\u63a7\u5236\u667a\u80fd\u9ad4\u8207\u771f\u5be6\u4e16\u754c\u4e92\u52d5 \u3002 \u00a0<\/li>\n<\/ul>\n\n\n\n<p>\u4e16\u754c\u6a21\u578b\u7684\u7bc4\u5f0f\u8f49\u8b8a\u2014\u2014\u5c07\u611f\u77e5\/\u9810\u6e2c\u8207\u63a7\u5236\u89e3\u8026\u2014\u2014\u70ba\u6a21\u578b\u57fa\u790e\u5f37\u5316\u5b78\u7fd2\u958b\u95e2\u4e86\u65b0\u7684\u9053\u8def\uff0c\u4e26\u555f\u767c\u4e86\u5f8c\u7e8c\u4e00\u7cfb\u5217\u6210\u529f\u7684\u6a21\u578b\u57fa\u790e\u667a\u80fd\u9ad4\uff0c\u5982 Dreamer \u7cfb\u5217 <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u7b2c\u56db\u90e8\u5206\uff1a\u5411\u5c08\u5bb6\u5b78\u7fd2 &#8211; \u6a21\u4eff\u5b78\u7fd2\u7bc4\u5f0f<\/h2>\n\n\n\n<p>\u524d\u9762\u7684\u90e8\u5206\u90fd\u57fa\u65bc\u4e00\u500b\u5171\u540c\u7684\u5047\u8a2d\uff1a\u74b0\u5883\u80fd\u5920\u63d0\u4f9b\u4e00\u500b\u660e\u78ba\u7684\u734e\u52f5\u51fd\u6578\uff08reward 
function\uff09\uff0c\u4ee5\u6307\u5c0e\u667a\u80fd\u9ad4\u7684\u5b78\u7fd2\u3002\u7136\u800c\uff0c\u5728\u8a31\u591a\u73fe\u5be6\u4e16\u754c\u7684\u61c9\u7528\u4e2d\uff0c\u8a2d\u8a08\u4e00\u500b\u80fd\u5920\u6e96\u78ba\u3001\u7121\u6b67\u7fa9\u5730\u63cf\u8ff0\u4efb\u52d9\u76ee\u6a19\u7684\u734e\u52f5\u51fd\u6578\u672c\u8eab\u5c31\u662f\u4e00\u500b\u5de8\u5927\u7684\u6311\u6230\u3002\u672c\u90e8\u5206\u5c07\u63a2\u8a0e\u4e00\u500b\u5b8c\u5168\u4e0d\u540c\u7684\u5b78\u7fd2\u7bc4\u5f0f\u2014\u2014\u6a21\u4eff\u5b78\u7fd2\uff08Imitation Learning\uff09\uff0c\u5b83\u89e3\u6c7a\u4e86\u4e00\u500b\u66f4\u6839\u672c\u7684\u554f\u984c\uff1a\u7576\u6c92\u6709\u734e\u52f5\u51fd\u6578\u6642\uff0c\u667a\u80fd\u9ad4\u5982\u4f55\u5b78\u7fd2\uff1f\u6211\u5011\u5c07\u8ffd\u8e64\u5176\u5f9e\u7c21\u55ae\u4f46\u6709\u7f3a\u9677\u7684\u884c\u70ba\u514b\u9686\uff0c\u6f14\u5316\u5230\u66f4\u7a69\u5065\u3001\u66f4\u5f37\u5927\u7684\u5c0d\u6297\u6027\u65b9\u6cd5\u7684\u6b77\u7a0b\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">4.1 \u734e\u52f5\u5de5\u7a0b\u7684\u6311\u6230\u8207\u6a21\u4eff\u7684\u627f\u8afe<\/h3>\n\n\n\n<p>\u6a19\u6e96\u5f37\u5316\u5b78\u7fd2\u6846\u67b6\u7684\u6838\u5fc3\u662f\u6700\u5927\u5316\u7d2f\u7a4d\u734e\u52f5\u3002\u7136\u800c\uff0c\u9019\u500b\u6846\u67b6\u7684\u6709\u6548\u6027\u56b4\u91cd\u4f9d\u8cf4\u65bc\u4e00\u500b\u7cbe\u5fc3\u8a2d\u8a08\u7684\u734e\u52f5\u51fd\u6578\u3002\u5728\u8a31\u591a\u8907\u96dc\u7684\u73fe\u5be6\u4efb\u52d9\u4e2d\uff0c\u4f8b\u5982\u81ea\u52d5\u99d5\u99db\u3001\u6a5f\u5668\u4eba\u64cd\u4f5c\u6216\u5c0d\u8a71\u7cfb\u7d71\uff0c\u7528\u6578\u5b78\u8a9e\u8a00\u7cbe\u78ba\u5b9a\u7fa9\u671f\u671b\u7684\u884c\u70ba\u662f\u975e\u5e38\u56f0\u96e3\u7684 <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u4e00\u500b\u8a2d\u8a08\u4e0d\u7576\u7684\u734e\u52f5\u51fd\u6578\u5f88\u5bb9\u6613\u5c0e\u81f4\u300c\u734e\u52f5\u99ed\u5ba2\u300d\uff08reward 
hacking\uff09\u73fe\u8c61\uff1a\u667a\u80fd\u9ad4\u6703\u627e\u5230\u734e\u52f5\u51fd\u6578\u7684\u6f0f\u6d1e\u6216\u6377\u5f91\uff0c\u4ee5\u6700\u5927\u5316\u5176\u6578\u503c\u734e\u52f5\uff0c\u4f46\u5176\u884c\u70ba\u537b\u8207\u8a2d\u8a08\u8005\u7684\u521d\u8877\u5927\u76f8\u5f91\u5ead <sup><\/sup>\u3002\u4f8b\u5982\uff0c\u4e00\u500b\u88ab\u734e\u52f5\u300c\u6536\u96c6\u5783\u573e\u300d\u7684\u6e05\u6f54\u6a5f\u5668\u4eba\uff0c\u53ef\u80fd\u6703\u5b78\u6703\u5c07\u5783\u573e\u6254\u5230\u5730\u4e0a\u518d\u64bf\u8d77\u4f86\uff0c\u4ee5\u6b64\u5faa\u74b0\u4f86\u5237\u5206\u3002 &nbsp;<\/p>\n\n\n\n<p><strong>\u6a21\u4eff\u5b78\u7fd2\uff08Imitation Learning, IL\uff09<\/strong>\uff0c\u4e5f\u88ab\u7a31\u70ba<strong>\u5f9e\u793a\u7bc4\u4e2d\u5b78\u7fd2\uff08Learning from Demonstrations, LfD\uff09<\/strong>\uff0c\u70ba\u9019\u500b\u96e3\u984c\u63d0\u4f9b\u4e86\u53e6\u4e00\u7a2e\u89e3\u6c7a\u65b9\u6848\u3002\u5176\u6838\u5fc3\u601d\u60f3\u662f\uff0c\u8207\u5176\u8b93\u5de5\u7a0b\u5e2b\u8cbb\u76e1\u5fc3\u6a5f\u5730\u8a2d\u8a08\u734e\u52f5\u51fd\u6578\uff0c\u4e0d\u5982\u8b93\u667a\u80fd\u9ad4\u76f4\u63a5\u5f9e\u5c08\u5bb6\u7684\u793a\u7bc4\u4e2d\u5b78\u7fd2 <sup><\/sup>\u3002\u5728\u8a31\u591a\u60c5\u6cc1\u4e0b\uff0c\u8b93\u4eba\u985e\u5c08\u5bb6<strong>\u5c55\u793a<\/strong>\u5982\u4f55\u5b8c\u6210\u4e00\u9805\u4efb\u52d9\uff08\u4f8b\u5982\uff0c\u958b\u8eca\u3001\u758a\u8863\u670d\uff09\uff0c\u9060\u6bd4\u8b93\u4ed6\u5011<strong>\u63cf\u8ff0<\/strong>\u5b8c\u6210\u8a72\u4efb\u52d9\u7684\u734e\u52f5\u51fd\u6578\u8981\u5bb9\u6613\u5f97\u591a <sup><\/sup>\u3002 
&nbsp;<\/p>\n\n\n\n<p>\u5728\u6a21\u4eff\u5b78\u7fd2\u7684\u6846\u67b6\u4e0b\uff0c\u667a\u80fd\u9ad4\u4e0d\u518d\u63a5\u6536\u4f86\u81ea\u74b0\u5883\u7684\u734e\u52f5\u8a0a\u865f\uff0c\u53d6\u800c\u4ee3\u4e4b\u7684\u662f\u4e00\u500b\u7531\u5c08\u5bb6\u793a\u7bc4\u7d44\u6210\u7684\u6578\u64da\u96c6\uff0c\u5176\u4e2d\u5305\u542b\u4e86\u5c08\u5bb6\u5728\u4e0d\u540c\u72c0\u614b\u4e0b\u6240\u63a1\u53d6\u7684\u52d5\u4f5c\u5e8f\u5217\u3002\u667a\u80fd\u9ad4\u7684\u76ee\u6a19\u662f\u5b78\u7fd2\u4e00\u500b\u7b56\u7565\uff0c\u4f7f\u5176\u884c\u70ba\u76e1\u53ef\u80fd\u5730\u63a5\u8fd1\u5c08\u5bb6\u7684\u884c\u70ba\u3002\u9019\u7a2e\u65b9\u6cd5\u5c07\u554f\u984c\u5f9e\u300c\u5982\u4f55\u6700\u5927\u5316\u734e\u52f5\u300d\u8f49\u8b8a\u70ba\u300c\u5982\u4f55\u6a21\u4eff\u5c08\u5bb6\u300d\uff0c\u70ba\u5728\u6c92\u6709\u660e\u78ba\u734e\u52f5\u8a0a\u865f\u7684\u5834\u666f\u4e2d\u8a13\u7df4\u667a\u80fd\u9ad4\u63d0\u4f9b\u4e86\u53ef\u884c\u7684\u9014\u5f91\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">4.2 \u884c\u70ba\u514b\u9686\u53ca\u5176\u81f4\u547d\u7f3a\u9677\uff1a\u5171\u8b8a\u6578\u504f\u79fb\u554f\u984c<\/h3>\n\n\n\n<p>\u6a21\u4eff\u5b78\u7fd2\u4e2d\u6700\u76f4\u89c0\u3001\u6700\u7c21\u55ae\u7684\u65b9\u6cd5\u662f<strong>\u884c\u70ba\u514b\u9686\uff08Behavior Cloning, BC\uff09<\/strong><sup><\/sup>\u3002BC \u5c07\u6a21\u4eff\u554f\u984c\u76f4\u63a5\u8f49\u5316\u70ba\u4e00\u500b\u6a19\u6e96\u7684<strong>\u76e3\u7763\u5f0f\u5b78\u7fd2<\/strong>\u554f\u984c\u3002\u5b83\u5c07\u5c08\u5bb6\u793a\u7bc4\u6578\u64da\u96c6\u4e2d\u7684\u72c0\u614b\uff08state\uff09\u8996\u70ba\u8f38\u5165\u7279\u5fb5\uff08X\uff09\uff0c\u5c07\u5c08\u5bb6\u63a1\u53d6\u7684\u52d5\u4f5c\uff08action\uff09\u8996\u70ba\u6a19\u7c64\uff08Y\uff09\u3002\u7136\u5f8c\uff0c\u8a13\u7df4\u4e00\u500b\u7b56\u7565\u7db2\u8def <code>$\\pi(a|s)$<\/code> 
\u4f86\u5b78\u7fd2\u5f9e\u72c0\u614b\u5230\u52d5\u4f5c\u7684\u6620\u5c04\uff0c\u5176\u76ee\u6a19\u662f\u6700\u5c0f\u5316\u7db2\u8def\u9810\u6e2c\u7684\u52d5\u4f5c\u8207\u5c08\u5bb6\u5be6\u969b\u63a1\u53d6\u7684\u52d5\u4f5c\u4e4b\u9593\u7684\u5dee\u7570\uff08\u4f8b\u5982\uff0c\u4f7f\u7528\u5747\u65b9\u8aa4\u5dee\u6216\u4ea4\u53c9\u71b5\u640d\u5931\uff09<sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u884c\u70ba\u514b\u9686\u56e0\u5176\u7c21\u55ae\u6027\u800c\u88ab\u5ee3\u6cdb\u61c9\u7528\uff0c\u4f8b\u5982\uff0c\u65e9\u671f\u7684\u81ea\u52d5\u99d5\u99db\u7cfb\u7d71 ALVINN \u5c31\u662f\u900f\u904e\u6a21\u4eff\u4eba\u985e\u99d5\u99db\u54e1\u7684\u8f49\u5411\u64cd\u4f5c\u4f86\u5b78\u7fd2\u99d5\u99db\u7684 <sup><\/sup>\u3002\u7136\u800c\uff0c\u9019\u7a2e\u770b\u4f3c\u7c21\u55ae\u7684\u65b9\u6cd5\u5b58\u5728\u4e00\u500b\u81f4\u547d\u7684\u7f3a\u9677\uff0c\u5373<strong>\u5171\u8b8a\u6578\u504f\u79fb\uff08Covariate Shift\uff09<\/strong>\u554f\u984c <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u5171\u8b8a\u6578\u504f\u79fb\u662f\u6307\u6a21\u578b\u5728<strong>\u8a13\u7df4\u6642\u7684\u8f38\u5165\u6578\u64da\u5206\u4f48<\/strong>\u8207<strong>\u6e2c\u8a66\uff08\u6216\u90e8\u7f72\uff09\u6642\u7684\u8f38\u5165\u6578\u64da\u5206\u4f48<\/strong>\u4e0d\u4e00\u81f4\u7684\u73fe\u8c61\u3002\u5728\u884c\u70ba\u514b\u9686\u7684\u80cc\u666f\u4e0b\uff0c\u9019\u500b\u554f\u984c\u8868\u73fe\u70ba\uff1a<\/p>\n\n\n\n<ol start=\"1\" 
class=\"wp-block-list\">\n<li><strong>\u8a13\u7df4\u5206\u4f48<\/strong>\uff1a\u7b56\u7565\u7db2\u8def\u662f\u5728\u5c08\u5bb6\u6240\u7d93\u6b77\u7684\u72c0\u614b\u5206\u4f48\u4e0a\u9032\u884c\u8a13\u7df4\u7684\u3002\u5c08\u5bb6\u7531\u65bc\u5176\u9ad8\u8d85\u7684\u6280\u5de7\uff0c\u5176\u8a2a\u554f\u7684\u72c0\u614b\u901a\u5e38\u5c40\u9650\u65bc\u4e00\u689d\u300c\u6700\u512a\u300d\u8ecc\u8de1\u5468\u570d\u3002<\/li>\n\n\n\n<li><strong>\u6e2c\u8a66\u5206\u4f48<\/strong>\uff1a\u7576\u5b78\u7fd2\u5230\u7684\u7b56\u7565\u88ab\u90e8\u7f72\u5230\u74b0\u5883\u4e2d\u6642\uff0c\u7531\u65bc\u51fd\u6578\u903c\u8fd1\u7684\u8aa4\u5dee\uff0c\u5b83\u4e0d\u53ef\u907f\u514d\u5730\u6703\u72af\u4e00\u4e9b\u5c0f\u932f\u8aa4\u3002\u9019\u4e9b\u5c0f\u932f\u8aa4\u6703\u5c07\u667a\u80fd\u9ad4\u5f15\u5c0e\u5230\u4e00\u500b\u7a0d\u5fae\u504f\u96e2\u5c08\u5bb6\u8ecc\u8de1\u7684\u65b0\u72c0\u614b \u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u8aa4\u5dee\u7684\u7d1a\u806f\u6548\u61c9<\/strong>\uff1a\u7531\u65bc\u9019\u500b\u65b0\u72c0\u614b\u5f9e\u672a\u5728\u5c08\u5bb6\u7684\u793a\u7bc4\u6578\u64da\u4e2d\u51fa\u73fe\u904e\uff0c\u7b56\u7565\u7db2\u8def\u4e0d\u77e5\u9053\u5728\u9019\u7a2e\u60c5\u6cc1\u4e0b\u61c9\u8a72\u5982\u4f55\u6b63\u78ba\u884c\u52d5\uff0c\u56e0\u6b64\u5f88\u53ef\u80fd\u6703\u72af\u4e0b\u66f4\u5927\u7684\u932f\u8aa4\u3002\u9019\u500b\u66f4\u5927\u7684\u932f\u8aa4\u53c8\u6703\u5c07\u667a\u80fd\u9ad4\u5e36\u5230\u4e00\u500b\u66f4\u52a0\u964c\u751f\u7684\u72c0\u614b\uff0c\u5982\u6b64\u60e1\u6027\u5faa\u74b0\uff0c\u5c0e\u81f4\u8aa4\u5dee\u50cf\u6efe\u96ea\u7403\u4e00\u6a23\u4e0d\u65b7\u7d2f\u7a4d\uff0c\u6700\u7d42\u4f7f\u667a\u80fd\u9ad4\u5b8c\u5168\u504f\u96e2\u6b63\u5e38\u8ecc\u9053\uff0c\u5c0e\u81f4\u4efb\u52d9\u5931\u6557 \u3002 \u00a0<\/li>\n<\/ol>\n\n\n\n<p>\u9019\u500b\u554f\u984c\u7684\u6839\u672c\u539f\u56e0\u5728\u65bc\uff0cBC \u9055\u53cd\u4e86\u76e3\u7763\u5f0f\u5b78\u7fd2\u7684 I.I.D. 
\u5047\u8a2d\u3002\u5728\u5e8f\u5217\u6c7a\u7b56\u554f\u984c\u4e2d\uff0c\u667a\u80fd\u9ad4\u81ea\u8eab\u7684\u52d5\u4f5c\u6703\u5f71\u97ff\u5176\u672a\u4f86\u89c0\u6e2c\u5230\u7684\u72c0\u614b\uff0c\u5f9e\u800c\u6539\u8b8a\u4e86\u6578\u64da\u7684\u5206\u4f48\u3002BC \u50c5\u50c5\u5b78\u7fd2\u4e86\u5728\u300c\u5c08\u5bb6\u6703\u9047\u5230\u7684\u72c0\u614b\u300d\u4e0b\u8a72\u600e\u9ebc\u505a\uff0c\u537b\u6c92\u6709\u5b78\u6703\u5982\u4f55\u5f9e\u300c\u81ea\u5df1\u72af\u932f\u5f8c\u6703\u9047\u5230\u7684\u72c0\u614b\u300d\u4e2d\u6062\u5fa9\u904e\u4f86\u3002\u9019\u7a2e\u5206\u4f48\u4e0d\u5339\u914d\u5c0e\u81f4\u4e86 BC \u7684\u6cdb\u5316\u80fd\u529b\u5f88\u5dee\uff0c\u4f7f\u5176\u5728\u8a31\u591a\u5be6\u969b\u61c9\u7528\u4e2d\u8868\u73fe\u5f97\u975e\u5e38\u8106\u5f31\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">4.3 \u900f\u904e\u795e\u8aed\u4fee\u6b63\u8def\u7dda\uff1a\u6578\u64da\u96c6\u805a\u5408 (DAgger)<\/h3>\n\n\n\n<p>\u70ba\u4e86\u89e3\u6c7a\u884c\u70ba\u514b\u9686\u4e2d\u81f4\u547d\u7684\u5171\u8b8a\u6578\u504f\u79fb\u554f\u984c\uff0cSt\u00e9phane Ross \u548c J. 
Andrew Bagnell \u63d0\u51fa\u4e86\u4e00\u7a2e\u512a\u96c5\u800c\u6709\u6548\u7684\u89e3\u6c7a\u65b9\u6848\uff0c\u540d\u70ba\u6578\u64da\u96c6\u805a\u5408\uff08Dataset Aggregation, DAgger\uff09\u6f14\u7b97\u6cd5 <sup><\/sup>\u3002\u8207\u88ab\u52d5\u5730\u5b78\u7fd2\u4e00\u500b\u975c\u614b\u6578\u64da\u96c6\u7684 BC \u4e0d\u540c\uff0cDAgger \u662f\u4e00\u7a2e\u4e92\u52d5\u5f0f\uff08interactive\uff09\u7684\u6f14\u7b97\u6cd5\uff0c\u5b83\u9700\u8981\u4e00\u500b\u80fd\u5920\u5728\u7dda\u67e5\u8a62\u7684\u300c\u5c08\u5bb6\u795e\u8aed\u300d\uff08expert oracle\uff09<sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>DAgger \u7684\u6838\u5fc3\u601d\u60f3\u662f\uff0c\u65e2\u7136\u554f\u984c\u51fa\u5728\u8a13\u7df4\u6578\u64da\u7684\u5206\u4f48\u8207\u5b78\u7fd2\u8005\u5be6\u969b\u9047\u5230\u7684\u72c0\u614b\u5206\u4f48\u4e0d\u5339\u914d\uff0c\u90a3\u9ebc\u6211\u5011\u5c31\u61c9\u8a72\u8b93\u5b78\u7fd2\u8005\u5728\u81ea\u5df1\u5c07\u8981\u9047\u5230\u7684\u72c0\u614b\u5206\u4f48\u4e0a\u9032\u884c\u8a13\u7df4\u3002\u5b83\u900f\u904e\u4e00\u500b\u8fed\u4ee3\u7684\u5faa\u74b0\u904e\u7a0b\u4f86\u5be6\u73fe\u9019\u4e00\u9ede <sup><\/sup>\uff1a &nbsp;<\/p>\n\n\n\n<ol start=\"1\" class=\"wp-block-list\">\n<li><strong>\u521d\u59cb\u5316<\/strong>\uff1a\u9996\u5148\uff0c\u4f7f\u7528\u521d\u59cb\u7684\u5c08\u5bb6\u793a\u7bc4\u6578\u64da\u96c6\u8a13\u7df4\u4e00\u500b\u521d\u59cb\u7b56\u7565 <code>$\\pi_1$<\/code>\uff08\u9019\u4e00\u6b65\u7b49\u540c\u65bc\u6a19\u6e96\u7684\u884c\u70ba\u514b\u9686\uff09\u3002<\/li>\n\n\n\n<li><strong>\u57f7\u884c\u8207\u6536\u96c6<\/strong>\uff1a\u5728\u7b2c <code>i<\/code> \u6b21\u8fed\u4ee3\u4e2d\uff0c\u8b93\u7576\u524d\u7684\u7b56\u7565 <code>$\\pi_i$<\/code> \u5728\u74b0\u5883\u4e2d\u904b\u884c\uff0c\u6536\u96c6\u4e00\u689d\u6216\u591a\u689d\u65b0\u7684\u8ecc\u8de1\u3002\u9019\u689d\u8ecc\u8de1\u6240\u5305\u542b\u7684\u72c0\u614b\uff0c\u6b63\u662f\u7b56\u7565 <code>$\\pi_i$<\/code> 
\u5728\u5be6\u969b\u57f7\u884c\u6642\u6700\u6709\u53ef\u80fd\u9047\u5230\u7684\u72c0\u614b\u3002<\/li>\n\n\n\n<li><strong>\u67e5\u8a62\u5c08\u5bb6<\/strong>\uff1a\u5c0d\u65bc\u5728\u9019\u689d\u65b0\u8ecc\u8de1\u4e0a\u6536\u96c6\u5230\u7684\u6bcf\u4e00\u500b\u72c0\u614b <code>s<\/code>\uff0c\u5411\u5c08\u5bb6\u795e\u8aed\u67e5\u8a62\uff1a\u300c\u5728\u9019\u7a2e\u60c5\u6cc1\u4e0b\uff0c\u4f60\uff08\u5c08\u5bb6\uff09\u6703\u63a1\u53d6\u4ec0\u9ebc\u52d5\u4f5c\uff1f\u300d\u5c08\u5bb6\u6703\u7d66\u51fa\u6b63\u78ba\u7684\u52d5\u4f5c\u6a19\u7c64 <code>$\\pi^*(s)$<\/code>\u3002<\/li>\n\n\n\n<li><strong>\u6578\u64da\u96c6\u805a\u5408<\/strong>\uff1a\u5c07\u65b0\u6536\u96c6\u5230\u7684\u72c0\u614b-\u5c08\u5bb6\u52d5\u4f5c\u5c0d <code>$(s, \\pi^*(s))$<\/code> <strong>\u805a\u5408<\/strong>\u5230\u4e00\u500b\u4e0d\u65b7\u589e\u9577\u7684\u7e3d\u6578\u64da\u96c6\u4e2d\u3002\u9019\u500b\u7e3d\u6578\u64da\u96c6\u5305\u542b\u4e86\u4e4b\u524d\u6240\u6709\u8fed\u4ee3\u4e2d\u6536\u96c6\u5230\u7684\u6578\u64da\u3002<\/li>\n\n\n\n<li><strong>\u91cd\u65b0\u8a13\u7df4<\/strong>\uff1a\u4f7f\u7528\u9019\u500b\u805a\u5408\u5f8c\u7684\u3001\u898f\u6a21\u66f4\u5927\u3001\u5206\u4f48\u66f4\u5ee3\u7684\u6578\u64da\u96c6\uff0c\u91cd\u65b0\u8a13\u7df4\u4e00\u500b\u65b0\u7684\u3001\u66f4\u597d\u7684\u7b56\u7565 <code>$\\pi_{i+1}$<\/code>\u3002<\/li>\n\n\n\n<li><strong>\u91cd\u8907<\/strong>\uff1a\u91cd\u8907\u6b65\u9a5f 2 \u5230 5\uff0c\u76f4\u5230\u7b56\u7565\u6027\u80fd\u6536\u6582\u3002<\/li>\n<\/ol>\n\n\n\n<p><strong>DAgger \u70ba\u4f55\u6709\u6548\uff1f<\/strong> DAgger \u900f\u904e\u9019\u7a2e\u8fed\u4ee3\u5f0f\u7684\u300c\u57f7\u884c-\u67e5\u8a62-\u805a\u5408-\u518d\u8a13\u7df4\u300d\u5faa\u74b0\uff0c\u76f4\u63a5\u5730\u89e3\u6c7a\u4e86\u5171\u8b8a\u6578\u504f\u79fb\u554f\u984c\u3002\u5728\u6bcf\u4e00\u6b21\u8fed\u4ee3\u4e2d\uff0c\u5b83\u8feb\u4f7f\u8a13\u7df4\u6578\u64da\u7684\u5206\u4f48\u9010\u6f38\u5411\u5b78\u7fd2\u8005\u81ea\u8eab\u7684\u72c0\u614b\u5206\u4f48\u9760\u650f 
<sup><\/sup>\u3002\u7576\u5b78\u7fd2\u8005\u72af\u932f\u4e26\u9032\u5165\u4e00\u500b\u964c\u751f\u72c0\u614b\u6642\uff0cDAgger \u6703\u8a18\u9304\u4e0b\u9019\u500b\u72c0\u614b\uff0c\u4e26\u5f9e\u5c08\u5bb6\u90a3\u88e1\u7372\u5f97\u6b63\u78ba\u7684\u61c9\u5c0d\u65b9\u5f0f\u3002\u9019\u4e9b\u300c\u7cfe\u932f\u300d\u6578\u64da\u88ab\u52a0\u5165\u5230\u8a13\u7df4\u96c6\u4e2d\uff0c\u4f7f\u5f97\u4e0b\u4e00\u4ee3\u7684\u7b56\u7565\u5b78\u6703\u4e86\u5982\u4f55\u5f9e\u985e\u4f3c\u7684\u932f\u8aa4\u4e2d\u6062\u5fa9\u3002 &nbsp;<\/p>\n\n\n\n<p>\u5f9e\u672c\u8cea\u4e0a\u8b1b\uff0cDAgger \u8b93\u667a\u80fd\u9ad4\u5728\u8a13\u7df4\u904e\u7a0b\u4e2d\u300c\u770b\u5230\u300d\u4e86\u81ea\u5df1\u672a\u4f86\u53ef\u80fd\u6703\u72af\u7684\u932f\u8aa4\uff0c\u4e26\u63d0\u524d\u5f9e\u5c08\u5bb6\u90a3\u88e1\u5b78\u5230\u4e86\u88dc\u6551\u63aa\u65bd\u3002\u9019\u4f7f\u5f97\u6700\u7d42\u5b78\u5230\u7684\u7b56\u7565\u5c0d\u5176\u81ea\u8eab\u53ef\u80fd\u5f15\u8d77\u7684\u72c0\u614b\u5206\u4f48\u5177\u6709\u66f4\u5f37\u7684\u9b6f\u68d2\u6027\uff0c\u5f9e\u800c\u986f\u8457\u512a\u65bc\u55ae\u7d14\u7684\u884c\u70ba\u514b\u9686\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">4.4 \u63a8\u65b7\u610f\u5716\uff1a\u9006\u5f37\u5316\u5b78\u7fd2 (IRL) \u7c21\u4ecb<\/h3>\n\n\n\n<p>\u884c\u70ba\u514b\u9686\u548c DAgger \u90fd\u5c6c\u65bc\u76f4\u63a5\u6a21\u4eff\u5c08\u5bb6<strong>\u884c\u70ba<\/strong>\uff08policy\uff09\u7684\u65b9\u6cd5\u3002\u7136\u800c\uff0c\u6a21\u4eff\u5b78\u7fd2\u9818\u57df\u9084\u5b58\u5728\u53e6\u4e00\u7a2e\u66f4\u6df1\u5c64\u6b21\u7684\u7bc4\u5f0f\uff1a<strong>\u9006\u5f37\u5316\u5b78\u7fd2\uff08Inverse Reinforcement Learning, IRL\uff09<\/strong>\u3002IRL \u7684\u76ee\u6a19\u4e0d\u662f\u6a21\u4eff\u5c08\u5bb6\u7684\u300c\u505a\u4ec0\u9ebc\u300d\uff08what\uff09\uff0c\u800c\u662f\u63a8\u65b7\u5c08\u5bb6\u300c\u70ba\u4ec0\u9ebc\u9019\u9ebc\u505a\u300d\uff08why\uff09\uff0c\u5373\u6062\u5fa9\u5c08\u5bb6\u884c\u70ba\u80cc\u5f8c\u6f5b\u5728\u7684<strong>\u734e\u52f5\u51fd\u6578<\/strong> 
<sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>IRL \u7684\u57fa\u672c\u5047\u8a2d\u662f\uff0c\u5c08\u5bb6\u662f\u5728\u6700\u5927\u5316\u67d0\u500b\u672a\u77e5\u7684\u734e\u52f5\u51fd\u6578\u3002\u56e0\u6b64\uff0c\u900f\u904e\u89c0\u5bdf\u5c08\u5bb6\u7684\u884c\u70ba\u8ecc\u8de1\uff0c\u6211\u5011\u53ef\u4ee5\u53cd\u5411\u63a8\u65b7\u51fa\u6700\u80fd\u89e3\u91cb\u9019\u4e9b\u884c\u70ba\u7684\u734e\u52f5\u51fd\u6578\u662f\u4ec0\u9ebc <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u4e00\u500b\u5178\u578b\u7684 IRL \u6f14\u7b97\u6cd5\u6d41\u7a0b\u5927\u81f4\u5982\u4e0b\uff1a<\/p>\n\n\n\n<ol start=\"1\" class=\"wp-block-list\">\n<li>\u96a8\u6a5f\u521d\u59cb\u5316\u4e00\u500b\u734e\u52f5\u51fd\u6578\u3002<\/li>\n\n\n\n<li>\u5728\u7576\u524d\u7684\u734e\u52f5\u51fd\u6578\u4e0b\uff0c\u4f7f\u7528\u6a19\u6e96\u7684\uff08\u6b63\u5411\uff09\u5f37\u5316\u5b78\u7fd2\u6f14\u7b97\u6cd5\uff08\u5982\u50f9\u503c\u8fed\u4ee3\u6216\u7b56\u7565\u68af\u5ea6\uff09\u6c42\u89e3\u51fa\u6700\u512a\u7b56\u7565\u3002<\/li>\n\n\n\n<li>\u6bd4\u8f03\u6c42\u89e3\u51fa\u7684\u7b56\u7565\u8207\u5c08\u5bb6\u7b56\u7565\u7684\u5dee\u7570\u3002<\/li>\n\n\n\n<li>\u6839\u64da\u5dee\u7570\uff0c\u66f4\u65b0\u734e\u52f5\u51fd\u6578\uff0c\u4f7f\u5f97\u5c08\u5bb6\u7b56\u7565\u5728\u8a72\u734e\u52f5\u51fd\u6578\u4e0b\u7684\u50f9\u503c\u66f4\u9ad8\u3002<\/li>\n\n\n\n<li>\u91cd\u8907\u6b65\u9a5f 2 \u5230 4\uff0c\u76f4\u5230\u627e\u5230\u4e00\u500b\u80fd\u8b93\u5c08\u5bb6\u7b56\u7565\u986f\u5f97\u6700\u512a\u7684\u734e\u52f5\u51fd\u6578\u3002<\/li>\n<\/ol>\n\n\n\n<p><strong>\u76f8\u5c0d\u65bc\u76f4\u63a5\u7b56\u7565\u6a21\u4eff\u7684\u512a\u52e2<\/strong> \u5b78\u7fd2\u734e\u52f5\u51fd\u6578\u901a\u5e38\u88ab\u8a8d\u70ba\u662f\u4e00\u7a2e\u6bd4\u76f4\u63a5\u6a21\u4eff\u7b56\u7565\u66f4<strong>\u7a69\u5065<\/strong>\u548c<strong>\u53ef\u6cdb\u5316<\/strong>\u7684\u65b9\u6cd5 <sup><\/sup>\uff1a &nbsp;<\/p>\n\n\n\n<ul 
class=\"wp-block-list\">\n<li><strong>\u66f4\u7c21\u6f54\u7684\u4efb\u52d9\u8868\u793a<\/strong>\uff1a\u734e\u52f5\u51fd\u6578\u901a\u5e38\u662f\u6bd4\u7b56\u7565\u66f4\u7c21\u6f54\u3001\u66f4\u672c\u8cea\u7684\u4efb\u52d9\u63cf\u8ff0\u3002\u4f8b\u5982\uff0c\u300c\u907f\u514d\u78b0\u649e\u4e26\u76e1\u5feb\u5230\u9054\u76ee\u7684\u5730\u300d\u9019\u500b\u734e\u52f5\u51fd\u6578\uff0c\u6bd4\u63cf\u8ff0\u5728\u6bcf\u500b\u8def\u53e3\u3001\u6bcf\u7a2e\u8eca\u6d41\u60c5\u6cc1\u4e0b\u7684\u5177\u9ad4\u99d5\u99db\u64cd\u4f5c\u7684\u7b56\u7565\u8981\u7c21\u6f54\u5f97\u591a\u3002<\/li>\n\n\n\n<li><strong>\u66f4\u597d\u7684\u6cdb\u5316\u80fd\u529b<\/strong>\uff1a\u4e00\u500b\u5b78\u5230\u7684\u734e\u52f5\u51fd\u6578\u53ef\u4ee5\u66f4\u597d\u5730\u9077\u79fb\u5230\u52d5\u614b\u74b0\u5883\u767c\u751f\u8b8a\u5316\u7684\u65b0\u5834\u666f\u4e2d\u3002\u4f8b\u5982\uff0c\u5982\u679c\u6211\u5011\u5b78\u6703\u4e86\u81ea\u52d5\u99d5\u99db\u7684\u734e\u52f5\u51fd\u6578\uff0c\u5373\u4f7f\u5730\u5716\u6216\u4ea4\u901a\u898f\u5247\u767c\u751f\u4e86\u8f15\u5fae\u8b8a\u5316\uff0c\u6211\u5011\u4ecd\u7136\u53ef\u4ee5\u5229\u7528\u9019\u500b\u734e\u52f5\u51fd\u6578\u5728\u65b0\u74b0\u5883\u4e2d\u6c42\u89e3\u51fa\u65b0\u7684\u6700\u512a\u7b56\u7565\u3002\u800c\u4e00\u500b\u76f4\u63a5\u6a21\u4eff\u5b78\u4f86\u7684\u7b56\u7565\u5728\u74b0\u5883\u8b8a\u5316\u6642\u53ef\u80fd\u5c31\u5b8c\u5168\u5931\u6548\u4e86 \u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u8d85\u8d8a\u5c08\u5bb6\u7684\u6f5b\u529b<\/strong>\uff1a\u4e00\u65e6\u6211\u5011\u6062\u5fa9\u4e86\u5c08\u5bb6\u7684\u734e\u52f5\u51fd\u6578\uff0c\u6211\u5011\u5c31\u6709\u53ef\u80fd\u5229\u7528\u66f4\u5f37\u5927\u7684\u898f\u5283\u6216 RL \u6f14\u7b97\u6cd5\uff0c\u627e\u5230\u4e00\u500b\u6bd4\u5c08\u5bb6\u672c\u8eab\u66f4\u597d\u7684\u7b56\u7565\uff0c\u5f9e\u800c\u5be6\u73fe\u6027\u80fd\u4e0a\u7684\u8d85\u8d8a \u3002 \u00a0<\/li>\n<\/ul>\n\n\n\n<p>IRL 
\u6355\u6349\u4e86\u5c08\u5bb6\u7684<strong>\u610f\u5716<\/strong>\uff0c\u800c\u4e0d\u50c5\u50c5\u662f\u5176\u8868\u9762\u7684\u884c\u70ba\u3002\u9019\u7a2e\u5c0d\u4efb\u52d9\u76ee\u6a19\u7684\u6df1\u5c64\u7406\u89e3\uff0c\u4f7f\u5176\u5728\u8655\u7406\u74b0\u5883\u8b8a\u5316\u548c\u8ffd\u6c42\u66f4\u9ad8\u6027\u80fd\u65b9\u9762\u5177\u6709\u7368\u7279\u7684\u512a\u52e2\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">4.5 \u5c0d\u6297\u6027\u65b9\u6cd5\uff1a\u751f\u6210\u5c0d\u6297\u6a21\u4eff\u5b78\u7fd2 (GAIL)<\/h3>\n\n\n\n<p>\u5118\u7ba1\u9006\u5f37\u5316\u5b78\u7fd2\uff08IRL\uff09\u5728\u7406\u8ad6\u4e0a\u975e\u5e38\u5f37\u5927\uff0c\u4f46\u50b3\u7d71\u7684 IRL \u65b9\u6cd5\u5728\u5be6\u8e10\u4e2d\u9762\u81e8\u4e00\u500b\u56b4\u91cd\u7684\u554f\u984c\uff1a<strong>\u8a08\u7b97\u6210\u672c\u6975\u9ad8<\/strong>\u3002\u5176\u6f14\u7b97\u6cd5\u7684\u6838\u5fc3\u5faa\u74b0\u4e2d\u901a\u5e38\u5305\u542b\u4e00\u500b\u5b8c\u6574\u7684\uff08\u6b63\u5411\uff09\u5f37\u5316\u5b78\u7fd2\u6c42\u89e3\u904e\u7a0b\uff0c\u5373\u5728\u6bcf\u6b21\u66f4\u65b0\u734e\u52f5\u51fd\u6578\u5f8c\uff0c\u90fd\u9700\u8981\u91cd\u65b0\u6c42\u89e3\u4e00\u500b\u99ac\u53ef\u592b\u6c7a\u7b56\u904e\u7a0b\uff08MDP\uff09\u3002\u9019\u500b\u300cRL in the loop\u300d\u7684\u7d50\u69cb\u4f7f\u5f97 IRL \u7684\u8a08\u7b97\u91cf\u5de8\u5927\uff0c\u96e3\u4ee5\u64f4\u5c55\u5230\u8907\u96dc\u7684\u554f\u984c\u4e0a <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>\u70ba\u4e86\u89e3\u6c7a\u9019\u500b\u554f\u984c\uff0cJonathan Ho \u548c Stefano Ermon \u63d0\u51fa\u4e86<strong>\u751f\u6210\u5c0d\u6297\u6a21\u4eff\u5b78\u7fd2\uff08Generative Adversarial Imitation Learning, GAIL\uff09<\/strong>\uff0c\u9019\u662f\u4e00\u7a2e\u5de7\u5999\u7684\u65b9\u6cd5\uff0c\u5b83\u5be6\u73fe\u4e86 IRL \u7684\u7a69\u5065\u6027\uff0c\u540c\u6642\u907f\u514d\u4e86\u5176\u9ad8\u6602\u7684\u8a08\u7b97\u6210\u672c <sup><\/sup>\u3002GAIL 
\u7684\u6838\u5fc3\u601d\u60f3\u662f\u5c07\u6a21\u4eff\u5b78\u7fd2\u554f\u984c\u91cd\u65b0\u6846\u5b9a\u70ba\u4e00\u500b<strong>\u751f\u6210\u5c0d\u6297\u7db2\u8def\uff08Generative Adversarial Network, GAN\uff09<\/strong>\u554f\u984c\u3002 &nbsp;<\/p>\n\n\n\n<p>\u5728 GAIL \u7684\u6846\u67b6\u4e2d\uff0c\u6a21\u4eff\u5b78\u7fd2\u88ab\u8996\u70ba\u4e00\u5834\u5169\u500b\u7db2\u8def\u4e4b\u9593\u7684\u5c0d\u6297\u904a\u6232\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>\u751f\u6210\u5668\uff08Generator\uff09<\/strong>\uff1a\u667a\u80fd\u9ad4\u7684<strong>\u7b56\u7565\u7db2\u8def <code>$\\pi_\\theta$<\/code><\/strong> \u626e\u6f14\u751f\u6210\u5668\u7684\u89d2\u8272\u3002\u5b83\u7684\u76ee\u6a19\u662f\u751f\u6210\u72c0\u614b-\u52d5\u4f5c\u8ecc\u8de1\uff0c\u4f7f\u5176\u770b\u8d77\u4f86\u76e1\u53ef\u80fd\u50cf\u662f\u4f86\u81ea\u5c08\u5bb6\u793a\u7bc4\u7684\u300c\u771f\u5be6\u300d\u8ecc\u8de1\uff0c\u5f9e\u800c\u300c\u6b3a\u9a19\u300d\u5224\u5225\u5668 \u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u5224\u5225\u5668\uff08Discriminator\uff09<\/strong>\uff1a\u4e00\u500b<strong>\u5206\u985e\u5668\u7db2\u8def <code>$D_w$<\/code><\/strong> \u626e\u6f14\u5224\u5225\u5668\u7684\u89d2\u8272\u3002\u5b83\u7684\u4efb\u52d9\u662f\u5340\u5206\u8f38\u5165\u7684\u72c0\u614b-\u52d5\u4f5c\u5c0d\u662f\u4f86\u81ea\u300c\u771f\u5be6\u300d\u7684\u5c08\u5bb6\u6578\u64da\u96c6\uff0c\u9084\u662f\u4f86\u81ea\u300c\u865b\u5047\u300d\u7684\u751f\u6210\u5668\uff08\u5373\u7576\u524d\u7684\u7b56\u7565\u7db2\u8def <code>$\\pi_\\theta$<\/code>\uff09\u3002 \u00a0<\/li>\n<\/ul>\n\n\n\n<p>\u8a13\u7df4\u904e\u7a0b\u662f\u4e00\u500b\u4ea4\u66ff\u512a\u5316\u7684 minimax \u904a\u6232\uff1a<\/p>\n\n\n\n<ol start=\"1\" class=\"wp-block-list\">\n<li><strong>\u8a13\u7df4\u5224\u5225\u5668<\/strong>\uff1a\u56fa\u5b9a\u751f\u6210\u5668\uff08\u7b56\u7565\uff09\uff0c\u5f9e\u5c08\u5bb6\u6578\u64da\u548c\u7b56\u7565\u751f\u6210\u7684\u6578\u64da\u4e2d\u63a1\u6a23\uff0c\u8a13\u7df4\u5224\u5225\u5668 <code>$D_w$<\/code> 
\u4f86\u6700\u5927\u5316\u5176\u5206\u985e\u6e96\u78ba\u7387\u3002<\/li>\n\n\n\n<li><strong>\u8a13\u7df4\u751f\u6210\u5668<\/strong>\uff1a\u56fa\u5b9a\u5224\u5225\u5668\uff0c\u66f4\u65b0\u7b56\u7565 <code>$\\pi_\\theta$<\/code> \u7684\u53c3\u6578\uff0c\u4f7f\u5176\u751f\u6210\u7684\u8ecc\u8de1\u80fd\u5920\u6700\u5927\u7a0b\u5ea6\u5730\u300c\u8ff7\u60d1\u300d\u5224\u5225\u5668\uff0c\u5373\u8b93\u5224\u5225\u5668\u5c07\u5176\u5224\u65b7\u70ba\u300c\u771f\u5be6\u300d\u7684\u6a5f\u7387\u66f4\u9ad8\u3002<\/li>\n<\/ol>\n\n\n\n<p>GAIL \u6700\u6df1\u523b\u7684\u6d1e\u898b\u5728\u65bc\uff0c<strong>\u5224\u5225\u5668\u7684\u8f38\u51fa\u53ef\u4ee5\u88ab\u76f4\u63a5\u7528\u4f5c\u4e00\u500b\u734e\u52f5\u8a0a\u865f<\/strong>\u4f86\u6307\u5c0e\u7b56\u7565\u7684\u5b78\u7fd2\u3002\u5177\u9ad4\u4f86\u8aaa\uff0c\u7b56\u7565 <code>$\\pi_\\theta$<\/code> \u7684\u734e\u52f5\u53ef\u4ee5\u88ab\u5b9a\u7fa9\u70ba <code>$r(s,a) = -\\log(1 - D_w(s,a))$<\/code>\u3002\u7576\u7b56\u7565\u7522\u751f\u7684\u72c0\u614b-\u52d5\u4f5c\u5c0d\u8d8a\u80fd\u8ff7\u60d1\u5224\u5225\u5668\uff08\u5373 <code>$D_w(s,a)$<\/code> \u8d8a\u63a5\u8fd1 1\uff0c\u4ee3\u8868\u5224\u5225\u5668\u8a8d\u70ba\u5b83\u4f86\u81ea\u5c08\u5bb6\uff09\uff0c\u5b83\u7372\u5f97\u7684\u734e\u52f5\u5c31\u8d8a\u9ad8\u3002<\/p>\n\n\n\n<p>\u900f\u904e\u9019\u7a2e\u65b9\u5f0f\uff0c\u5224\u5225\u5668\u96b1\u5f0f\u5730\u5b78\u7fd2\u4e86\u4e00\u500b\u8207\u5c08\u5bb6\u884c\u70ba\u4e00\u81f4\u7684\u734e\u52f5\uff08\u6216\u6210\u672c\uff09\u51fd\u6578\uff0c\u800c\u7b56\u7565\u5247\u900f\u904e\u6a19\u6e96\u7684\u7b56\u7565\u68af\u5ea6\u6f14\u7b97\u6cd5\uff08\u5982 TRPO \u6216 PPO\uff09\u5728\u9019\u500b\u96b1\u5f0f\u734e\u52f5\u51fd\u6578\u4e0a\u9032\u884c\u512a\u5316\u3002\u9019\u4f7f\u5f97 GAIL \u80fd\u5920<strong>\u76f4\u63a5\u5b78\u7fd2\u4e00\u500b\u5339\u914d\u5c08\u5bb6\u72c0\u614b-\u52d5\u4f5c\u5206\u4f48\u7684\u7b56\u7565<\/strong>\uff0c\u5b8c\u5168\u7e5e\u904e\u4e86\u50b3\u7d71 IRL 
\u4e2d\u986f\u5f0f\u6062\u5fa9\u734e\u52f5\u51fd\u6578\u548c\u5167\u90e8 RL \u5faa\u74b0\u7684\u6602\u8cb4\u6b65\u9a5f <sup><\/sup>\u3002 &nbsp;<\/p>\n\n\n\n<p>GAIL \u4ee3\u8868\u4e86\u4e00\u7a2e\u91cd\u8981\u7684\u601d\u60f3\u8f49\u8b8a\uff0c\u5b83\u8b49\u660e\u4e86\u53ef\u4ee5\u900f\u904e\u5c0d\u6297\u6027\u8a13\u7df4\u4f86\u6709\u6548\u5730\u9032\u884c\u5206\u4f48\u5339\u914d\uff0c\u5f9e\u800c\u5be6\u73fe\u7a69\u5065\u7684\u6a21\u4eff\u5b78\u7fd2\u3002\u9019\u4e00\u6846\u67b6\u4e0d\u50c5\u5927\u5927\u63d0\u9ad8\u4e86\u6a21\u4eff\u5b78\u7fd2\u7684\u6548\u7387\u548c\u53ef\u64f4\u5c55\u6027\uff0c\u4e5f\u70ba\u5f8c\u7e8c\u8a31\u591a\u57fa\u65bc\u5c0d\u6297\u601d\u60f3\u7684 DRL \u6f14\u7b97\u6cd5\u92ea\u5e73\u4e86\u9053\u8def\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u7b2c\u4e94\u90e8\u5206\uff1a\u7d9c\u5408\u8207\u672a\u4f86\u5c55\u671b<\/h2>\n\n\n\n<h3 class=\"wp-block-heading\">5.1 \u6f14\u5316\u8ecc\u8de1\u7684\u7d71\u4e00\u8996\u89d2<\/h3>\n\n\n\n<p>\u56de\u9867\u5f9e DQN \u5230 GAIL \u7684\u6f14\u5316\u6b77\u7a0b\uff0c\u6211\u5011\u53ef\u4ee5\u770b\u5230\u4e00\u689d\u6e05\u6670\u7684\u3001\u7531\u554f\u984c\u9a45\u52d5\u5275\u65b0\u7684\u601d\u60f3\u8108\u7d61\u3002\u9019\u4e0d\u50c5\u50c5\u662f\u6f14\u7b97\u6cd5\u7684\u5806\u758a\uff0c\u800c\u662f\u5728\u89e3\u6c7a\u6839\u672c\u6027\u6311\u6230\u904e\u7a0b\u4e2d\u7684\u908f\u8f2f\u905e\u9032\u3002\u5e7e\u500b\u6838\u5fc3\u4e3b\u984c\u53cd\u8986\u51fa\u73fe\uff0c\u8cab\u7a7f\u4e86\u6574\u500b\u767c\u5c55\u904e\u7a0b\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>\u5f9e\u9593\u63a5\u5230\u76f4\u63a5\u7684\u512a\u5316<\/strong>\uff1a\u6f14\u5316\u7684\u8d77\u9ede DQN 
\u662f\u4e00\u7a2e\u9593\u63a5\u65b9\u6cd5\uff0c\u5b83\u900f\u904e\u512a\u5316\u4e00\u500b\u4ee3\u7406\u76ee\u6a19\uff08\u50f9\u503c\u51fd\u6578\uff09\u4f86\u9593\u63a5\u6539\u9032\u7b56\u7565\u3002\u7136\u800c\uff0c\u9019\u7a2e\u9593\u63a5\u6027\u5e36\u4f86\u4e86\u9023\u7e8c\u52d5\u4f5c\u7a7a\u9593\u548c\u96a8\u6a5f\u7b56\u7565\u7684\u96e3\u984c\u3002PPO \u7b49\u7b56\u7565\u68af\u5ea6\u65b9\u6cd5\u5247\u8f49\u5411\u76f4\u63a5\u512a\u5316\u6211\u5011\u6700\u7d42\u95dc\u5fc3\u7684\u76ee\u6a19\u2014\u2014\u7b56\u7565\u672c\u8eab\uff0c\u9019\u4f7f\u5f97\u554f\u984c\u7684\u8868\u8ff0\u66f4\u52a0\u81ea\u7136\u548c\u76f4\u63a5\u3002<\/li>\n\n\n\n<li><strong>\u7a69\u5b9a\u6027\u8207\u6548\u7387\u7684\u6301\u7e8c\u535a\u5f08<\/strong>\uff1a\u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\u7684\u6b77\u53f2\u5728\u5f88\u5927\u7a0b\u5ea6\u4e0a\u662f\u4e00\u90e8\u95dc\u65bc\u5982\u4f55\u99b4\u670d\u4e0d\u7a69\u5b9a\u6027\u7684\u6b77\u53f2\u3002DQN \u7684\u7d93\u9a57\u91cd\u64ad\u548c\u76ee\u6a19\u7db2\u8def\u662f\u70ba\u4e86\u7a69\u5b9a\u300c\u51fd\u6578\u903c\u8fd1+\u81ea\u8209\u300d\u7684\u7d44\u5408\uff1b\u884c\u52d5\u8005-\u8a55\u8ad6\u5bb6\u6846\u67b6\u662f\u70ba\u4e86\u7a69\u5b9a\u9ad8\u8b8a\u7570\u6578\u7684\u7b56\u7565\u68af\u5ea6\uff1bPPO \u5247\u662f\u70ba\u4e86\u7a69\u5b9a\u7b56\u7565\u66f4\u65b0\u7684\u6b65\u9577\u3002\u6bcf\u4e00\u6b21\u7a69\u5b9a\u6027\u7684\u63d0\u5347\uff0c\u90fd\u4f34\u96a8\u8457\u5c0d\u6a23\u672c\u6548\u7387\u7684\u8003\u91cf\uff0c\u4f8b\u5982 PER \u900f\u904e\u667a\u80fd\u63a1\u6a23\u63d0\u5347\u6548\u7387\uff0cPPO \u900f\u904e\u591a\u8f2a\u66f4\u65b0\u63d0\u5347\u6548\u7387\u3002<\/li>\n\n\n\n<li><strong>\u8907\u96dc\u554f\u984c\u7684\u89e3\u8026<\/strong>\uff1a\u4e00\u500b\u5f37\u5927\u7684\u5de5\u7a0b\u548c\u79d1\u5b78\u539f\u5247\u662f\u5c07\u4e00\u500b\u8907\u96dc\u7684\u5927\u554f\u984c\u5206\u89e3\u70ba\u591a\u500b\u66f4\u7c21\u55ae\u7684\u5b50\u554f\u984c\u3002World Models \u5c07\u611f\u77e5\/\u9810\u6e2c\u554f\u984c\uff08\u7531\u5927\u578b V 
\u548c M \u6a21\u578b\u8655\u7406\uff09\u8207\u63a7\u5236\u554f\u984c\uff08\u7531\u5c0f\u578b C \u6a21\u578b\u8655\u7406\uff09\u5fb9\u5e95\u89e3\u8026\u3002\u540c\u6a23\uff0cGAIL \u5c07\u6a21\u4eff\u5b78\u7fd2\u554f\u984c\u89e3\u8026\u70ba\u5206\u4f48\u5339\u914d\uff08\u7531\u5224\u5225\u5668\u8a55\u4f30\uff09\u548c\u7b56\u7565\u641c\u7d22\uff08\u7531\u751f\u6210\u5668\u57f7\u884c\uff09\uff0c\u907f\u514d\u4e86\u50b3\u7d71 IRL \u4e2d\u7cfe\u7e8f\u5728\u4e00\u8d77\u7684\u734e\u52f5\u8a2d\u8a08\u548c\u7b56\u7565\u5b78\u7fd2\u3002<\/li>\n\n\n\n<li><strong>\u5c0d\u6578\u64da\u5206\u4f48\u7684\u65e5\u76ca\u7cbe\u7d30\u7684\u8655\u7406<\/strong>\uff1a\u6f14\u7b97\u6cd5\u5982\u4f55\u5c0d\u5f85\u548c\u5229\u7528\u6578\u64da\uff0c\u662f\u53e6\u4e00\u500b\u95dc\u9375\u7684\u6f14\u5316\u7dad\u5ea6\u3002DQN \u7684\u7d93\u9a57\u91cd\u64ad\u900f\u904e\u96a8\u6a5f\u5316\u4f86\u6253\u7834\u6578\u64da\u76f8\u95dc\u6027\uff1bDAgger \u5247\u66f4\u9032\u4e00\u6b65\uff0c\u900f\u904e\u8207\u5c08\u5bb6\u7684\u4e92\u52d5\u4f86\u4e3b\u52d5\u4fee\u6b63\u8a13\u7df4\u6578\u64da\u7684\u5206\u4f48\uff0c\u4ee5\u89e3\u6c7a\u5171\u8b8a\u6578\u504f\u79fb\uff1bGAIL \u5247\u5c07\u5206\u4f48\u8655\u7406\u63d0\u5347\u5230\u4e00\u500b\u65b0\u7684\u9ad8\u5ea6\uff0c\u5b83\u4e0d\u8a66\u5716\u4fee\u6b63\u6578\u64da\uff0c\u800c\u662f\u76f4\u63a5\u8a2d\u5b9a\u76ee\u6a19\u70ba\u5339\u914d\u5c08\u5bb6\u6578\u64da\u7684\u6f5b\u5728\u5206\u4f48\uff0c\u4e26\u5229\u7528\u5c0d\u6297\u6027\u8a13\u7df4\u4f5c\u70ba\u5f37\u5927\u7684\u5de5\u5177\u4f86\u5be6\u73fe\u9019\u4e00\u76ee\u6a19\u3002<\/li>\n<\/ul>\n\n\n\n<p>\u9019\u689d\u6f14\u5316\u8def\u5f91\u5c55\u793a\u4e86\u8a72\u9818\u57df\u5f9e\u6700\u521d\u7684\u300c\u53ef\u884c\u6027\u8b49\u660e\u300d\uff08DQN\uff09\u767c\u5c55\u5230\u8ffd\u6c42\u300c\u7a69\u5b9a\u8207\u9ad8\u6548\u300d\uff08PPO\uff09\uff0c\u518d\u5230\u63a2\u7d22\u300c\u66f4\u9ad8\u5c64\u6b21\u7684\u8a8d\u77e5\u80fd\u529b\u300d\uff08World Models 
\u7684\u60f3\u50cf\u8207\u898f\u5283\uff09\u548c\u300c\u66f4\u9748\u6d3b\u7684\u5b78\u7fd2\u4f86\u6e90\u300d\uff08Imitation Learning \u7684\u5c08\u5bb6\u793a\u7bc4\uff09\u7684\u6210\u719f\u904e\u7a0b\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">5.2 \u65b0\u8208\u8da8\u52e2\u8207\u958b\u653e\u6027\u6311\u6230<\/h3>\n\n\n\n<p>\u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\u9818\u57df\u4ecd\u5728\u9ad8\u901f\u767c\u5c55\uff0c\u7576\u524d\u7684\u7814\u7a76\u524d\u6cbf\u6b63\u81f4\u529b\u65bc\u878d\u5408\u4e0a\u8ff0\u6f14\u5316\u8def\u5f91\u4e2d\u7684\u5404\u7a2e\u601d\u60f3\uff0c\u4e26\u61c9\u5c0d\u5c07\u9019\u4e9b\u5f37\u5927\u6f14\u7b97\u6cd5\u61c9\u7528\u65bc\u73fe\u5be6\u4e16\u754c\u6240\u9762\u81e8\u7684\u6839\u672c\u6027\u6311\u6230\u3002<\/p>\n\n\n\n<p><strong>\u65b0\u8208\u8da8\u52e2\uff1a<\/strong><\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>\u7bc4\u5f0f\u878d\u5408<\/strong>\uff1a\u7814\u7a76\u4eba\u54e1\u6b63\u7a4d\u6975\u63a2\u7d22\u5c07\u4e0d\u540c\u7bc4\u5f0f\u7d50\u5408\u8d77\u4f86\u3002\u4f8b\u5982\uff0c\u5c07\u6a21\u578b\u57fa\u790e RL \u8207\u6a21\u4eff\u5b78\u7fd2\u7d50\u5408\uff0c\u667a\u80fd\u9ad4\u53ef\u4ee5\u5148\u5f9e\u5c08\u5bb6\u793a\u7bc4\u4e2d\u5b78\u7fd2\u4e00\u500b\u521d\u6b65\u7684\u4e16\u754c\u6a21\u578b\u548c\u7b56\u7565\uff0c\u7136\u5f8c\u5728\u81ea\u5df1\u7684\u300c\u5922\u5883\u300d\u4e2d\u9032\u884c\u5fae\u8abf\u548c\u6539\u9032\uff0c\u9019\u6709\u671b\u540c\u6642\u63d0\u5347\u6a23\u672c\u6548\u7387\u548c\u6027\u80fd\u3002<\/li>\n\n\n\n<li><strong>\u57fa\u790e\u6a21\u578b\u7684\u5f71\u97ff<\/strong>\uff1a\u5927\u578b\u8a9e\u8a00\u6a21\u578b\uff08LLM\uff09\u548c\u8996\u89ba\u57fa\u790e\u6a21\u578b\uff08VFM\uff09\u7684\u6210\u529f\uff0c\u70ba DRL \u63d0\u4f9b\u4e86\u65b0\u7684\u53ef\u80fd\u6027\u3002\u9019\u4e9b\u9810\u8a13\u7df4\u6a21\u578b\u53ef\u4ee5\u4f5c\u70ba\u5f37\u5927\u7684\u4e16\u754c\u6a21\u578b\u6216\u8868\u5fb5\u63d0\u53d6\u5668\uff0c\u70ba RL 
\u667a\u80fd\u9ad4\u63d0\u4f9b\u8c50\u5bcc\u7684\u5148\u9a57\u77e5\u8b58\uff0c\u5f9e\u800c\u6975\u5927\u5730\u6e1b\u5c11\u5f9e\u96f6\u958b\u59cb\u5b78\u7fd2\u6240\u9700\u7684\u6578\u64da\u91cf\u3002<\/li>\n\n\n\n<li><strong>\u96e2\u7dda\u5f37\u5316\u5b78\u7fd2 (Offline RL)<\/strong>\uff1a\u5728\u8a31\u591a\u73fe\u5be6\u5834\u666f\u4e2d\uff0c\u667a\u80fd\u9ad4\u7121\u6cd5\u81ea\u7531\u5730\u8207\u74b0\u5883\u4e92\u52d5\u63a2\u7d22\uff0c\u53ea\u80fd\u5f9e\u4e00\u500b\u56fa\u5b9a\u7684\u3001\u9810\u5148\u6536\u96c6\u597d\u7684\u6578\u64da\u96c6\u4e2d\u5b78\u7fd2\u3002\u96e2\u7dda RL \u65e8\u5728\u89e3\u6c7a\u9019\u4e00\u6311\u6230\uff0c\u5b83\u9700\u8981\u958b\u767c\u51fa\u80fd\u5920\u5728\u4e0d\u7522\u751f\u5206\u4f48\u5916\uff08out-of-distribution\uff09\u52d5\u4f5c\u7684\u60c5\u6cc1\u4e0b\uff0c\u5b89\u5168\u5730\u5f9e\u975c\u614b\u6578\u64da\u4e2d\u63d0\u53d6\u6709\u6548\u7b56\u7565\u7684\u6f14\u7b97\u6cd5\u3002<\/li>\n<\/ul>\n\n\n\n<p><strong>\u958b\u653e\u6027\u6311\u6230\uff1a<\/strong><\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>\u6cdb\u5316\u8207\u9069\u61c9\u6027<\/strong>\uff1a\u5118\u7ba1 DRL \u5728\u7279\u5b9a\u4efb\u52d9\u4e0a\u53d6\u5f97\u4e86\u6210\u529f\uff0c\u4f46\u8b93\u667a\u80fd\u9ad4\u5c07\u5b78\u5230\u7684\u6280\u80fd\u6cdb\u5316\u5230\u4e00\u500b\u8207\u8a13\u7df4\u74b0\u5883\u6709\u7d30\u5fae\u5dee\u5225\u7684\u65b0\u74b0\u5883\u4e2d\uff0c\u4ecd\u7136\u662f\u4e00\u500b\u5de8\u5927\u7684\u6311\u6230\u3002\u5982\u4f55\u5b78\u7fd2\u5230\u771f\u6b63\u9b6f\u68d2\u4e14\u53ef\u9077\u79fb\u7684\u7b56\u7565\u662f\u8a72\u9818\u57df\u7684\u6838\u5fc3\u554f\u984c\u4e4b\u4e00 \u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u6a23\u672c\u6548\u7387<\/strong>\uff1a\u5118\u7ba1\u6a21\u578b\u57fa\u790e\u65b9\u6cd5\u6709\u6240\u6539\u5584\uff0c\u4f46\u5c0d\u65bc\u5927\u591a\u6578 DRL 
\u6f14\u7b97\u6cd5\u800c\u8a00\uff0c\u9054\u5230\u9ad8\u6027\u80fd\u6240\u9700\u7684\u4e92\u52d5\u6a23\u672c\u6578\u91cf\u4ecd\u7136\u662f\u9a5a\u4eba\u7684\u3002\u5c0d\u65bc\u73fe\u5be6\u4e16\u754c\u7684\u61c9\u7528\uff08\u5982\u6a5f\u5668\u4eba\uff09\uff0c\u9019\u4f9d\u7136\u662f\u4e00\u500b\u4e3b\u8981\u7684\u74f6\u9838 \u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u5b89\u5168\u8207\u53ef\u9760\u6027<\/strong>\uff1a\u5728\u5c07 DRL \u61c9\u7528\u65bc\u81ea\u52d5\u99d5\u99db\u3001\u91ab\u7642\u6216\u91d1\u878d\u7b49\u9ad8\u98a8\u96aa\u9818\u57df\u4e4b\u524d\uff0c\u5fc5\u9808\u78ba\u4fdd\u667a\u80fd\u9ad4\u7684\u884c\u70ba\u662f\u5b89\u5168\u3001\u53ef\u9810\u6e2c\u548c\u53ef\u9760\u7684\u3002\u5982\u4f55\u70ba\u8907\u96dc\u7684\u6df1\u5ea6\u5b78\u7fd2\u7b56\u7565\u63d0\u4f9b\u884c\u70ba\u4fdd\u8b49\uff0c\u4ee5\u53ca\u5982\u4f55\u907f\u514d\u707d\u96e3\u6027\u7684\u5931\u6557\uff0c\u662f\u81f3\u95dc\u91cd\u8981\u7684\u7814\u7a76\u65b9\u5411 \u3002 \u00a0<\/li>\n\n\n\n<li><strong>\u734e\u52f5\u8a2d\u8a08<\/strong>\uff1a\u5118\u7ba1\u6a21\u4eff\u5b78\u7fd2\u63d0\u4f9b\u4e86\u4e00\u689d\u7e5e\u904e\u734e\u52f5\u8a2d\u8a08\u7684\u9014\u5f91\uff0c\u4f46\u5728\u8a31\u591a\u9700\u8981\u63a2\u7d22\u548c\u5275\u65b0\u7684\u4efb\u52d9\u4e2d\uff0c\u734e\u52f5\u4ecd\u7136\u662f\u5fc5\u8981\u7684\u3002\u5982\u4f55\u8a2d\u8a08\u80fd\u5920\u6e96\u78ba\u5f15\u5c0e\u8907\u96dc\u884c\u70ba\u4e14\u4e0d\u6613\u88ab\u300c\u99ed\u5ba2\u300d\u7684\u734e\u52f5\u51fd\u6578\uff0c\u4ecd\u7136\u662f\u4e00\u9580\u85dd\u8853\u800c\u975e\u79d1\u5b78 \u3002 
\u00a0<\/li>\n<\/ul>\n\n\n\n<p>\u7e3d\u800c\u8a00\u4e4b\uff0c\u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\u7684\u6f14\u5316\u4e4b\u65c5\u9060\u672a\u7d50\u675f\u3002\u5f9e\u50f9\u503c\u51fd\u6578\u7684\u7a69\u5b9a\u4f30\u8a08\uff0c\u5230\u7b56\u7565\u7684\u76f4\u63a5\u5b89\u5168\u512a\u5316\uff0c\u518d\u5230\u5c0d\u4e16\u754c\u6a21\u578b\u7684\u5167\u90e8\u69cb\u5efa\u548c\u5c0d\u5c08\u5bb6\u610f\u5716\u7684\u6df1\u523b\u7406\u89e3\uff0c\u9019\u689d\u8def\u5f91\u5c55\u793a\u4e86\u4eba\u5de5\u667a\u6167\u5728\u8ffd\u6c42\u901a\u7528\u6c7a\u7b56\u80fd\u529b\u65b9\u9762\u7684\u6301\u7e8c\u63a2\u7d22\u3002\u672a\u4f86\u7684\u7a81\u7834\u5c07\u53ef\u80fd\u4f86\u81ea\u65bc\u5c0d\u9019\u4e9b\u4e0d\u540c\u601d\u60f3\u7684\u66f4\u6df1\u5c64\u6b21\u7684\u878d\u5408\uff0c\u4ee5\u53ca\u5c0d\u89e3\u6c7a\u73fe\u5be6\u4e16\u754c\u61c9\u7528\u4e2d\u6839\u672c\u6027\u6311\u6230\u7684\u4e0d\u61c8\u52aa\u529b\u3002<\/p>\n\n\n\n<p><strong>\u660e\u5929\u5c07\u6703\u63a8\u51fa\u4e92\u52d5\u5f0f\u7db2\u7ad9\uff0c\u656c\u8acb\u671f\u5f85\uff01<\/strong><\/p>\n\n\n\n<p><\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\u7684\u6f14\u5316\uff1a\u5f9e\u50f9\u503c\u51fd\u6578\u5230\u4e16\u754c\u6a21\u578b\u8207\u6a21\u4eff\u5b78\u7fd2 \u5c0e\u8ad6 \u6df1\u5ea6\u5f37\u5316\u5b78\u7fd2\uff08Deep &hellip; <a href=\"https:\/\/ouyangminwei.com\/index.php\/2025\/07\/31\/dqn\/\">\u95b1\u8b80\u5168\u6587 <span 
class=\"meta-nav\">&rarr;<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"om_disable_all_campaigns":false,"_monsterinsights_skip_tracking":false,"_monsterinsights_sitenote_active":false,"_monsterinsights_sitenote_note":"","_monsterinsights_sitenote_category":0,"footnotes":""},"categories":[1],"tags":[],"post_format":[],"class_list":["post-880","post","type-post","status-publish","format-standard","hentry","category-uncategorized"],"_edit_lock":"1752832532:1","_edit_last":"1","_aioseo_title":"#post_title #separator_sa #site_title","_aioseo_description":"#post_excerpt","_aioseo_keywords":"","_aioseo_og_title":"","_aioseo_og_description":"","_aioseo_og_article_section":"","_aioseo_og_article_tags":"","_aioseo_twitter_title":"","_aioseo_twitter_description":"","_oembed_2544c1d0cb3503ab4c4d558c3b3c8873":"","_oembed_time_2544c1d0cb3503ab4c4d558c3b3c8873":"","_oembed_99481806ecbe6ce4ee46f8588d320993":"","_oembed_db663acf973e82e6d9d80df71945dfb8":"","_oembed_16cdfab488f57db73586f4286af2704f":"","_wp_old_slug":"","_links":{"self":[{"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/posts\/880","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/comments?post=880"}],"version-history":[{"count":2,"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/posts\/880\/revisions"}],"predecessor-version":[{"id":883,"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/posts\/880\/revisions\/883"}],"wp:attachment":[{"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/m
edia?parent=880"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/categories?post=880"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/tags?post=880"},{"taxonomy":"post_format","embeddable":true,"href":"https:\/\/ouyangminwei.com\/index.php\/wp-json\/wp\/v2\/post_format?post=880"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}