{"created":"2023-06-26T11:00:55.777889+00:00","id":1526,"links":{},"metadata":{"_buckets":{"deposit":"74f7123b-3e12-4f14-bba4-7e267eb354f2"},"_deposit":{"created_by":22,"id":"1526","owners":[22],"pid":{"revision_id":0,"type":"depid","value":"1526"},"status":"published"},"_oai":{"id":"oai:oist.repo.nii.ac.jp:00001526","sets":["7:200"]},"author_link":[],"item_10006_creator_2":{"attribute_name":"Author","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"小津野, 将","creatorNameLang":"ja"}]}]},"item_10006_creator_3":{"attribute_name":"Author","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Kozuno, Tadashi","creatorNameLang":"en"}]}]},"item_10006_date_granted_11":{"attribute_name":"Degree Conferral Date","attribute_value_mlt":[{"subitem_dategranted":"2020-03-31"}]},"item_10006_degree_grantor_9":{"attribute_name":"Degree Conferrral Institution","attribute_value_mlt":[{"subitem_degreegrantor":[{"subitem_degreegrantor_name":"Okinawa Institute of Science and Technology Graduate University"}],"subitem_degreegrantor_identifier":[{"subitem_degreegrantor_identifier_name":"38005","subitem_degreegrantor_identifier_scheme":"kakenhi"}]}]},"item_10006_degree_name_8":{"attribute_name":"Degree","attribute_value_mlt":[{"subitem_degreename":"Doctor of Philosophy"}]},"item_10006_description_7":{"attribute_name":"Abstract","attribute_value_mlt":[{"subitem_description":"Model-free deep Reinforcement Learning (RL) algorithms, a combination of deep learning and model-free RL algorithms, have attained remarkable successes in solving complex tasks such as video games. However, theoretical analyses and recent empirical results indicate its proneness to various types of value update errors including but not limited to estimation error of updates due to finite samples and function approximation error. Because real-world tasks are inherently complex and stochastic, such errors are inevitable, and thus, the development of error-tolerant RL algorithms are of great importance for applications of RL to real problems. To this end, I propose two error-tolerant algorithms for RL called Conservative Value Iteration (CVI) and Gap-increasing RetrAce for Policy Evaluation (GRAPE). \n CVI unifies value-iteration-like single-stage-lookahead algorithms such as soft value iteration, advantage learning and Ψ-learning, all of which are characterized by the use of a gap-increasing operator and/or softmax operator in value updates. We provide detailed theoretical analysis of CVI that not only shows CVI's advantages but also contributes to the theory of RL in the following two points: First, it elucidates pros and cons of gap-increasing and softmax operators. Second, it provides an actual example in which performance of algorithms with max operator is worse than that of algorithms with softmax operator demonstrating the limitation of traditional greedy value updates. \n GRAPE is a policy evaluation algorithm extending advantage learning (AL) and retrace, both of which have different advantages: AL is noise-tolerant as shown through our theoretical analysis of CVI, while retrace is efficient in that it is off-policy and allows the control of bias-variance trade-off. Theoretical analys is of GRAPE shows that it enjoys the merits of both algorithms. In experiments, we demonstrate the benefit of GRAPE combined with a variant of trust region policy optimization and its superiority to previous algorithms. 
\n Through these studies, I theoretically elucidated the benefits of gap-increasing and softmax operators in both policy evaluation and control settings. While some open problems remain as explained in the final chapter, the results presented in this thesis are an important step towards a deep understanding of RL algorithms.","subitem_description_language":"en","subitem_description_type":"Other"}]},"item_10006_dissertation_number_12":{"attribute_name":"Degree Referral Number","attribute_value_mlt":[{"subitem_dissertationnumber":"甲第49号"}]},"item_10006_identifier_registration":{"attribute_name":"ID登録","attribute_value_mlt":[{"subitem_identifier_reg_text":"10.15102/1394.00001389","subitem_identifier_reg_type":"JaLC"}]},"item_10006_rights_13":{"attribute_name":"Copyright Information","attribute_value_mlt":[{"subitem_rights":"© 2020 The Author."}]},"item_10006_text_24":{"attribute_name":"Exam Date","attribute_value_mlt":[{"subitem_text_value":"2020-01-20"}]},"item_10006_version_type_18":{"attribute_name":"Version Format","attribute_value_mlt":[{"subitem_version_resource":"http://purl.org/coar/version/c_970fb48d4fbd8a85","subitem_version_type":"VoR"}]},"item_access_right":{"attribute_name":"アクセス権","attribute_value_mlt":[{"subitem_access_right":"open access","subitem_access_right_uri":"http://purl.org/coar/access_right/c_abf2"}]},"item_files":{"attribute_name":"ファイル情報","attribute_type":"file","attribute_value_mlt":[{"accessrole":"open_date","date":[{"dateType":"Available","dateValue":"2020-05-26"}],"displaytype":"detail","filename":"Full text.pdf","filesize":[{"value":"2.0 MB"}],"format":"application/pdf","licensefree":"Creative Commons Attribution 4.0 International(https://creativecommons.org/licenses/by/4.0/)","licensetype":"license_note","mimetype":"application/pdf","url":{"label":"Full text","objectType":"fulltext","url":"https://oist.repo.nii.ac.jp/record/1526/files/Full text.pdf"},"version_id":"3f9d2ad3-fe68-4f04-8143-df3e30e33ef0"},{"accessrole":"open_date","date":[{"dateType":"Available","dateValue":"2020-05-26"}],"displaytype":"detail","filename":"Final Exam Abstract.pdf","filesize":[{"value":"42.9 kB"}],"format":"application/pdf","licensetype":"license_note","mimetype":"application/pdf","url":{"label":"Final Exam Abstract","objectType":"abstract","url":"https://oist.repo.nii.ac.jp/record/1526/files/Final Exam Abstract.pdf"},"version_id":"c33a1287-1305-4578-9ed5-ecc719f4311e"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"eng"}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourcetype":"doctoral thesis","resourceuri":"http://purl.org/coar/resource_type/c_db06"}]},"item_title":"ギャップ増大作用素とソフトマックス作用素の理論解析を通した高学習効率かつノイズ頑健な強化学習アルゴリズム","item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"ギャップ増大作用素とソフトマックス作用素の理論解析を通した高学習効率かつノイズ頑健な強化学習アルゴリズム","subitem_title_language":"ja"},{"subitem_title":"Efficient and Noise-Tolerant Reinforcement Learning Algorithms via Theoretical Analysis of Gap-Increasing and Softmax Operators","subitem_title_language":"en"}]},"item_type_id":"10006","owner":"22","path":["200"],"pubdate":{"attribute_name":"PubDate","attribute_value":"2020-05-26"},"publish_date":"2020-05-26","publish_status":"0","recid":"1526","relation_version_is_last":true,"title":["ギャップ増大作用素とソフトマックス作用素の理論解析を通した高学習効率かつノイズ頑健な強化学習アルゴリズム"],"weko_creator_id":"22","weko_shared_id":-1},"updated":"2023-08-31T02:52:25.867052+00:00"}
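
Note on the operators named in the abstract: the following is a minimal illustrative sketch, not the thesis's CVI or GRAPE algorithms. It contrasts a hard-max value-iteration backup, a Boltzmann softmax (log-sum-exp) backup of the kind used in soft value iteration, and an advantage-learning-style gap-increasing update on a small random MDP. The MDP, the inverse temperature beta, and the gap coefficient alpha are hypothetical choices for demonstration only.

# Illustrative sketch only: generic tabular backups with the max operator,
# a Boltzmann softmax operator, and a gap-increasing (advantage-learning-style)
# update. Hypothetical toy MDP and constants; NOT the CVI/GRAPE algorithms.
import numpy as np

rng = np.random.default_rng(0)

S, A, gamma = 5, 3, 0.95                      # states, actions, discount factor
P = rng.dirichlet(np.ones(S), size=(S, A))    # P[s, a] = distribution over next states
R = rng.normal(size=(S, A))                   # immediate rewards

def softmax_op(q, beta):
    # Boltzmann softmax over actions: (1/beta) * log sum_a exp(beta * q_a),
    # in numerically stable log-sum-exp form; approaches max as beta grows.
    m = q.max(-1, keepdims=True)
    return np.log(np.exp(beta * (q - m)).sum(-1)) / beta + q.max(-1)

def backup(q, value_of_q):
    # One-step lookahead: Q(s, a) = R(s, a) + gamma * E[V(s')].
    return R + gamma * P @ value_of_q(q)

q_max = np.zeros((S, A))                      # hard-max value iteration
q_soft = np.zeros((S, A))                     # softmax ("soft") value iteration
beta = 5.0                                    # hypothetical inverse temperature
for _ in range(500):
    q_max = backup(q_max, lambda q: q.max(-1))
    q_soft = backup(q_soft, lambda q: softmax_op(q, beta))

# Gap-increasing update: add back a scaled advantage term, which lowers the
# values of non-greedy actions and so widens the action gap.
alpha = 0.5                                   # hypothetical gap coefficient
q_al = np.zeros((S, A))
for _ in range(500):
    target = backup(q_al, lambda q: q.max(-1))
    q_al = target + alpha * (q_al - q_al.max(-1, keepdims=True))

print("max-operator V :", q_max.max(-1))
print("softmax V      :", q_soft.max(-1))
print("gap-increased V:", q_al.max(-1))

The intuition, consistent with the abstract's claim that AL-style updates are noise-tolerant, is that a wider action gap makes the greedy action less likely to be flipped by small errors in the value estimates.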