State-of-the-art text-to-speech synthesis can produce speech nearly indistinguishable from human speech, and is already deployed in smart speakers. At the same time, content production increasingly demands technologies that can flexibly render the same text in different ways depending on the situation. As a new direction in speech synthesis research, this project aims to develop technology that allows a computer to collaborate with a user as if it were a craftsman, assisting the user’s speech design. In collaboration with the field of psychology, we will identify psychological parameters for voice control suited to text-to-speech synthesis and voice quality conversion. The goal is to systematize, as “Bespoke Speech Design”, a set of techniques that refine the user’s rough design and synthesize natural-sounding speech.
@inproceedings{suda24interspeech_sukikirai,abbr_publisher={Proceedings of Interspeech},booktitle={Proceedings of Interspeech},title={Who Finds This Voice Attractive? A Large-Scale Experiment Using In-the-Wild Data},author={Suda, Hitoshi and Watanabe, Aya and Takamichi, Shinnosuke},year={2024},memo={This work was supported by JSPS KAKENHI Grant Numbers 23K20017, 21H04900, 22H03639, and 23H03418, and JST FOREST JPMJFR226V. This paper is based on results obtained from a project, JPNP20006, commissioned by the New Energy and Industrial Technology Development Organization (NEDO).}}
@inproceedings{ohnaka24asjs_vtts-width,abbr_publisher={Spring Meeting of the Acoustical Society of Japan},booktitle={Spring Meeting of the Acoustical Society of Japan},title={Speech synthesis from visual text stretched according to {F0}},author={Ohnaka, Hien and Miyazaki, Ryoichi and Takamichi, Shinnosuke},year={2024},memo={This work was supported by JSPS KAKENHI Grant Numbers 22H03639 and 21H04900.}}
@inproceedings{watanabe24asjs_coconut-embedding,abbr_publisher={Spring Meeting of the Acoustical Society of Japan},booktitle={Spring Meeting of the Acoustical Society of Japan},title={Acquiring embedding representations of speech and voice-characteristics descriptions with a contrastive learning model},author={Watanabe, Aya and Takamichi, Shinnosuke and Saito, Yuki and Nakata, Wataru and Xin, Detai and Saruwatari, Hiroshi},year={2024},memo={This work was supported by JSPS KAKENHI Grant Numbers 21H04900, 22H03639, and 23H03418, JST FOREST JP23KJ0828, and JST Moonshot R&D JPMJPS2011.}}
@inproceedings{meada24icassp_zipf-law,abbr_publisher={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},title={Do learned speech symbols follow {Z}ipf's law?},author={Takamichi, Shinnosuke and Maeda, Hiroki and Park, Joonyong and Saito, Daisuke and Saruwatari, Hiroshi},year={2024}}
@article{luo24apsipa-trans_emotion-synthesis,abbr_publisher={APSIPA Transactions on Signal and Information Processing},journal={APSIPA Transactions on Signal and Information Processing},title={Emotion-controllable Speech Synthesis using Emotion Soft Label, Utterance-level Prosodic Factors, and Word-level Prominence},author={Luo, Xuan and Takamichi, Shinnosuke and Saito, Yuki and Koriyama, Tomoki and Saruwatari, Hiroshi},year={2024}}
@inproceedings{take24asjs_audio-effect,abbr_publisher={Spring Meeting of the Acoustical Society of Japan},booktitle={Spring Meeting of the Acoustical Society of Japan},title={Effect chain estimation and dry signal recovery for musical sounds processed with multiple audio effects},author={Take, Osamu and Watanabe, Kento and Nakatsuka, Takayuki and Cheng, Tian and Nakano, Tomoyasu and Goto, Masataka and Takamichi, Shinnosuke and Saruwatari, Hiroshi},year={2024},memo={This work was supported by JSPS KAKENHI Grant Numbers 21H04900, 22H03639, and 23H03418, JST FOREST JP23KJ0828, and JST Moonshot R&D JPMJPS2011.}}
@inproceedings{take24dafx_effect-chain,abbr_publisher={Proceedings of International Conference on Digital Audio Effects (DAFx)},booktitle={Proceedings of International Conference on Digital Audio Effects (DAFx)},title={Audio Effect Chain Estimation and Dry Signal Recovery from Multi-Effect-Processed Musical Signals},author={Take, Osamu and Watanabe, Kento and Nakatsuka, Takayuki and Cheng, Tian and Nakano, Tomoyasu and Goto, Masataka and Takamichi, Shinnosuke and Saruwatari, Hiroshi},memo={This work was supported by JSPS KAKENHI Grant Numbers 21H04900, 22H03639, and 23H03418, JST FOREST JPMJFR226V, and Moonshot R&D Grant Number JPMJPS2011.},year={2024}}
@inproceedings{watanabe23asru_coconut-corpus,abbr_publisher={Proceedings of IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)},booktitle={Proceedings of IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)},title={{Coco-Nut}: Corpus of {J}apanese Utterance and Voice Characteristics Description for Prompt-based Control},author={Watanabe, Aya and Takamichi, Shinnosuke and Saito, Yuki and Nakata, Wataru and Xin, Detai and Saruwatari, Hiroshi},year={2023}}
@inproceedings{ueda23interspeech_humandiffusion,abbr_publisher={Proceedings of Interspeech},booktitle={Proceedings of Interspeech},title={HumanDiffusion: diffusion model using perceptual gradients},author={Ueda, Yota and Takamichi, Shinnosuke and Saito, Yuki and Takamune, Norihiro and Saruwatari, Hiroshi},year={2023}}
@inproceedings{watanabe23asja_coconut,abbr_publisher={Autumn Meeting of the Acoustical Society of Japan},booktitle={Autumn Meeting of the Acoustical Society of Japan},title={{Coco-Nut}: a multi-speaker paired dataset of speech and free-form voice-characteristics descriptions for description-based voice control},author={Watanabe, Aya and Takamichi, Shinnosuke and Saito, Yuki and Xin, Detai and Saruwatari, Hiroshi},year={2023}}
@inproceedings{seki23asjs_dark-data,abbr_publisher={Spring Meeting of the Acoustical Society of Japan},booktitle={Spring Meeting of the Acoustical Society of Japan},title={Speech synthesis from dark data via data selection using a training-evaluation loop},author={Seki, Kentaro and Takamichi, Shinnosuke and Saeki, Takaaki and Saruwatari, Hiroshi},year={2023}}
@inproceedings{nakamura23icassp_jacappella,abbr_publisher={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},title={{jaCappella} corpus: A Japanese a cappella vocal ensemble corpus},author={Nakamura, Tomohiko and Takamichi, Shinnosuke and Tanji, Naoko and Fukayama, Satoru and Saruwatari, Hiroshi},year={2023}}
@inproceedings{watanabe23icassp_mid-attribute,abbr_publisher={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},title={Mid-attribute Speaker Generation using Optimal-Transport-based Interpolation of Gaussian Mixture Models},author={Watanabe, Aya and Takamichi, Shinnosuke and Saito, Yuki and Xin, Detai and Saruwatari, Hiroshi},year={2023}}
@inproceedings{ohnaka23icassp_visual-onoma-to-wave,abbr_publisher={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},title={Visual onoma-to-wave: environmental sound synthesis from visual onomatopoeias and sound-source images},author={Ohnaka, Hien and Takamichi, Shinnosuke and Imoto, Keisuke and Okamoto, Yuki and Fujii, Kazuki and Saruwatari, Hiroshi},year={2023}}
@inproceedings{nakano23slt_visual-text-to-speech,abbr_publisher={Proceedings of IEEE Spoken Language Technology Workshop (SLT)},booktitle={Proceedings of IEEE Spoken Language Technology Workshop (SLT)},title={{vTTS}: visual-text to speech},author={Nakano, Yoshifumi and Saeki, Takaaki and Takamichi, Shinnosuke and Sudoh, Katsuhito and Saruwatari, Hiroshi},year={2023}}