This research aims to build an acoustic scene analysis infrastructure that achieves high performance while allowing for human intervention. Specifically, we aim to establish a sound source separation methodology, which we call deep analysis-synthesis, that offers both high separation performance and room for human intervention, by combining deep acoustic synthesis (a technology that couples the synthesizers established in signal processing with deep learning) and deep source separation (sound source separation based on deep learning). Rather than pursuing practical performance through training alone, applying this technology makes it possible to realize acoustic scene analysis methods that adapt, through human intervention, to situations containing elements that are difficult to foresee in advance. This should also make it possible to actively incorporate human prior knowledge and expert knowledge.
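To make the analysis-synthesis idea concrete, below is a minimal PyTorch sketch, not the project's actual implementation: a network analyzes a mixture into interpretable synthesizer parameters, a differentiable additive synthesizer resynthesizes a target source from them, and a human can edit those parameters before resynthesis. All module names, parameter ranges, and sizes (AnalysisNet, harmonic_synth, the 8-harmonic sinusoidal model, the f0 range) are illustrative assumptions, not details taken from the publications listed here.

# Minimal sketch (assumed design, not the authors' implementation) of
# "deep analysis-synthesis": analyze a mixture into interpretable
# synthesizer parameters, optionally let a human edit them, then
# resynthesize the source with a differentiable synthesizer.
import torch
import torch.nn as nn

SR = 16000          # sample rate (assumed)
N_HARMONICS = 8     # harmonics of the sinusoidal model (assumed)
FRAME = 160         # hop size: 10 ms frames at 16 kHz


class AnalysisNet(nn.Module):
    """Maps a waveform mixture to per-frame synthesizer parameters."""

    def __init__(self):
        super().__init__()
        self.frontend = nn.Conv1d(1, 64, kernel_size=FRAME, stride=FRAME)
        self.rnn = nn.GRU(64, 64, batch_first=True)
        # per frame: 1 value for f0 + N_HARMONICS harmonic amplitudes
        self.head = nn.Linear(64, 1 + N_HARMONICS)

    def forward(self, mix):                      # mix: (B, T)
        h = self.frontend(mix.unsqueeze(1))      # (B, 64, frames)
        h, _ = self.rnn(h.transpose(1, 2))       # (B, frames, 64)
        p = self.head(h)                         # (B, frames, 1 + H)
        f0 = 80.0 + 400.0 * torch.sigmoid(p[..., :1])   # Hz, bounded
        amps = torch.softmax(p[..., 1:], dim=-1)         # harmonic weights
        return f0, amps


def harmonic_synth(f0, amps):
    """Differentiable additive synthesizer: sum of harmonic sinusoids."""
    B, frames, H = amps.shape
    # upsample frame-rate parameters to sample rate
    f0 = f0.repeat_interleave(FRAME, dim=1)      # (B, T, 1)
    amps = amps.repeat_interleave(FRAME, dim=1)  # (B, T, H)
    k = torch.arange(1, H + 1, device=f0.device)
    phase = 2 * torch.pi * torch.cumsum(f0 / SR, dim=1) * k  # (B, T, H)
    return (amps * torch.sin(phase)).sum(-1)     # (B, T)


if __name__ == "__main__":
    net = AnalysisNet()
    mix = torch.randn(1, SR)                     # 1 s of audio
    f0, amps = net(mix)
    # Human intervention point: because the parameters are
    # interpretable, an expert can correct them before resynthesis,
    # e.g. raise the estimated pitch by two semitones:
    f0 = f0 * 2.0 ** (2 / 12)
    separated = harmonic_synth(f0, amps)
    print(separated.shape)                       # torch.Size([1, 16000])

Because separation here is mediated by synthesizer parameters rather than a spectrogram mask, the intervention point is explicit: a user edits f0 or the harmonic amplitudes and resynthesizes, which is the kind of expert-knowledge injection the paragraph above describes.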
@inproceedings{hyodo24slt_chorus,abbr_publisher={Proceedings of IEEE Spoken Language Technology Workshop (SLT)},booktitle={Proceedings of IEEE Spoken Language Technology Workshop (SLT)},title={DNN-based ensemble singing voice synthesis with interactions between singers},author={Hyodo, Hiroaki and Takamichi, Shinnosuke and Nakamura, Tomohiko and Koguchi, Junya and Saruwatari, Hiroshi},year={2024}}
@inproceedings{suda24interspeech_sukikirai,abbr_publisher={Proceedings of Interspeech},booktitle={Proceedings of Interspeech},title={Who Finds This Voice Attractive? A Large-Scale Experiment Using In-the-Wild Data},author={Suda, Hitoshi and Watanabe, Aya and Takamichi, Shinnosuke},year={2024},memo={This work was supported by JSPS KAKENHI Grant Numbers 23K20017, 21H04900, 22H03639, and 23H03418, and JST FOREST JPMJFR226V. This paper is based on results obtained from a project, JPNP20006, commissioned by the New Energy and Industrial Technology Development Organization (NEDO).}}
@inproceedings{take24dafx_effect-chain,abbr_publisher={Proceedings of International Conference on Digital Audio Effects (DAFx)},booktitle={Proceedings of International Conference on Digital Audio Effects (DAFx)},title={Audio Effect Chain Estimation and Dry Signal Recovery from Multi-Effect-Processed Musical Signals},author={Take, Osamu and Watanabe, Kento and Nakatsuka, Takayuki and Cheng, Tian and Nakano, Tomoyasu and Goto, Masataka and Takamichi, Shinnosuke and Saruwatari, Hiroshi},memo={This work was supported by JSPS KAKENHI 21H04900, 22H03639, and 23H03418, JST FOREST JPMJFR226V, and Moonshot R&D Grant Number JPMJPS2011.},year={2024}}
@inproceedings{seki24interspeech_spatial-voice-conversion,abbr_publisher={Proceedings of Interspeech},booktitle={Proceedings of Interspeech},title={Spatial Voice Conversion: Voice Conversion Preserving Spatial Information and Non-target Signals},author={Seki, Kentaro and Takamichi, Shinnosuke and Takamune, Norihiro and Saito, Yuki and Imamura, Kanami and Saruwatari, Hiroshi},year={2024},memo={This work was supported by Research Grant S of the Tateishi Science and Technology Foundation.}}
@inproceedings{watanabe24asjs_coconut-embedding,abbr_publisher={Spring Meeting of the Acoustical Society of Japan},booktitle={Spring Meeting of the Acoustical Society of Japan},title={Acquiring embedding representations of speech and voice-characteristics descriptions with a contrastive learning model},author={Watanabe, Aya and Takamichi, Shinnosuke and Saito, Yuki and Nakata, Wataru and Xin, Detai and Saruwatari, Hiroshi},year={2024},memo={This work was supported by JSPS KAKENHI 21H04900, 22H03639, and 23H03418, JST FOREST Program JP23KJ0828, and Moonshot R&D JPMJPS2011.}}
@article{saeki24access_selfremaster,title={{SelfRemaster}: Self-Supervised Speech Restoration for Historical Audio Resources},author={Saeki, Takaaki and Takamichi, Shinnosuke and Nakamura, Tomohiko and Tanji, Naoko and Saruwatari, Hiroshi},year={2024},journal={IEEE Access}}
@inproceedings{take24asjs_audio-effect,abbr_publisher={Spring Meeting of the Acoustical Society of Japan},booktitle={Spring Meeting of the Acoustical Society of Japan},title={Effect chain estimation and dry signal recovery for musical signals processed with multiple audio effects},author={Take, Osamu and Watanabe, Kento and Nakatsuka, Takayuki and Cheng, Tian and Nakano, Tomoyasu and Goto, Masataka and Takamichi, Shinnosuke and Saruwatari, Hiroshi},year={2024},memo={This work was supported by JSPS KAKENHI 21H04900, 22H03639, and 23H03418, JST FOREST Program JP23KJ0828, and Moonshot R&D JPMJPS2011.}}
@inproceedings{hyodo24asja_duet-timing,abbr_publisher={Autumn Meeting of the Acoustical Society of Japan},booktitle={Autumn Meeting of the Acoustical Society of Japan},title={Investigating the stimulus threshold of simultaneity perception for singing-onset timing in duets},author={Hyodo, Hiroaki and Takamichi, Shinnosuke and Saruwatari, Hiroshi},year={2024}}
@inproceedings{watanabe23asru_coconut-corpus,abbr_publisher={Proceedings of IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)},booktitle={Proceedings of IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)},title={{Coco-Nut}: Corpus of {J}apanese Utterance and Voice Characteristics Description for Prompt-based Control},author={Watanabe, Aya and Takamichi, Shinnosuke and Saito, Yuki and Nakata, Wataru and Xin, Detai and Saruwatari, Hiroshi},year={2023}}
@inproceedings{ueda23interspeech_humandiffusion,abbr_publisher={Proceedings of Interspeech},booktitle={Proceedings of Interspeech},title={{HumanDiffusion}: diffusion model using perceptual gradients},author={Ueda, Yota and Takamichi, Shinnosuke and Saito, Yuki and Takamune, Norihiro and Saruwatari, Hiroshi},year={2023}}
@inproceedings{watanabe23asja_coconut,abbr_publisher={Autumn Meeting of the Acoustical Society of Japan},booktitle={Autumn Meeting of the Acoustical Society of Japan},title={{Coco-Nut}: a multi-speaker dataset pairing speech with free-form voice-characteristics descriptions for text-based voice control},author={Watanabe, Aya and Takamichi, Shinnosuke and Saito, Yuki and Xin, Detai and Saruwatari, Hiroshi},year={2023}}
@inproceedings{maeda23asja_zipf,abbr_publisher={Autumn Meeting of the Acoustical Society of Japan},booktitle={Autumn Meeting of the Acoustical Society of Japan},title={Do speech symbols acquired by deep learning follow {Zipf}'s law like natural-language symbols?},author={Maeda, Hiroki and Takamichi, Shinnosuke and Park, Joonyong and Saruwatari, Hiroshi},year={2023}}
@inproceedings{park23interspeech_gslm,abbr_publisher={Proceedings of Interspeech},booktitle={Proceedings of Interspeech},title={How Generative Spoken Language Model Encodes Noisy Speech: Investigation from Phonetics to Syntactics},author={Park, Joonyong and Takamichi, Shinnosuke and Nakamura, Tomohiko and Seki, Kentaro and Xin, Detai and Saruwatari, Hiroshi},year={2023}}