Selected Publications

Please see my Google Scholar for the complete publication list.

Movie Gen: A Cast of Media Foundation Models
Meta's Movie Gen team
[Webpage] / [arXiv] / [MovieGenBench (GitHub)] / [bibtex]


  @misc{polyak2024moviegencastmedia,
    title         = {{Movie Gen}: A Cast of Media Foundation Models},
    author        = {Adam Polyak and Amit Zohar and Andrew Brown and Andros Tjandra and Animesh Sinha and Ann Lee and Apoorv Vyas and Bowen Shi and Chih-Yao Ma and Ching-Yao Chuang and David Yan and Dhruv Choudhary and Dingkang Wang and Geet Sethi and Guan Pang and Haoyu Ma and Ishan Misra and Ji Hou and Jialiang Wang and Kiran Jagadeesh and Kunpeng Li and Luxin Zhang and Mannat Singh and Mary Williamson and Matt Le and Matthew Yu and Mitesh Kumar Singh and Peizhao Zhang and Peter Vajda and Quentin Duval and Rohit Girdhar and Roshan Sumbaly and Sai Saketh Rambhatla and Sam Tsai and Samaneh Azadi and Samyak Datta and Sanyuan Chen and Sean Bell and Sharadh Ramaswamy and Shelly Sheynin and Siddharth Bhattacharya and Simran Motwani and Tao Xu and Tianhe Li and Tingbo Hou and Wei-Ning Hsu and Xi Yin and Xiaoliang Dai and Yaniv Taigman and Yaqiao Luo and Yen-Cheng Liu and Yi-Chiao Wu and Yue Zhao and Yuval Kirstain and Zecheng He and Zijian He and Albert Pumarola and Ali Thabet and Artsiom Sanakoyeu and Arun Mallya and Baishan Guo and Boris Araya and Breena Kerr and Carleigh Wood and Ce Liu and Cen Peng and Dimitry Vengertsev and Edgar Schonfeld and Elliot Blanchard and Felix Juefei-Xu and Fraylie Nord and Jeff Liang and John Hoffman and Jonas Kohler and Kaolin Fire and Karthik Sivakumar and Lawrence Chen and Licheng Yu and Luya Gao and Markos Georgopoulos and Rashel Moritz and Sara K. Sampson and Shikai Li and Simone Parmeggiani and Steve Fine and Tara Fowler and Vladan Petrovic and Yuming Du},
    year          = {2024},
    eprint        = {2410.13720},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CV},
    url           = {https://arxiv.org/abs/2410.13720},
  }

Emu: Enhancing Image Generation Models Using Photogenic Needles in a Haystack
Xiaoliang Dai^*, Ji Hou^*, Chih-Yao Ma^*, Sam Tsai^*, Jialiang Wang^*, Rui Wang^*, Peizhao Zhang^*, Simon Vandenhende, Xiaofang Wang, Abhimanyu Dubey, Matthew Yu, Abhishek Kadian, Filip Radenovic, Dhruv Mahajan, Kunpeng Li, Yue Zhao, Vladan Petrovic, Mitesh Kumar Singh, Simran Motwani, Yi Wen, Yiwen Song, Roshan Sumbaly⁺, Vignesh Ramanathan⁺, Zijian He⁺, Peter Vajda⁺, Devi Parikh⁺
(^*: Core contributors: equal contribution, alphabetical order.)
(⁺: Equal last authors.)
[arXiv] / [bibtex]


  @misc{dai2023emu,
    author        = {Dai, Xiaoliang and Hou, Ji and Ma, Chih-Yao and Tsai, Sam and Wang, Jialiang and Wang, Rui and Zhang, Peizhao and Vandenhende, Simon and Wang, Xiaofang and Dubey, Abhimanyu and Yu, Matthew and Kadian, Abhishek and Radenovic, Filip and Mahajan, Dhruv and Li, Kunpeng and Zhao, Yue and Petrovic, Vladan and Singh, Mitesh Kumar and Motwani, Simran and Wen, Yi and Song, Yiwen and Sumbaly, Roshan and Ramanathan, Vignesh and He, Zijian and Vajda, Peter and Parikh, Devi},
    title         = {{Emu}: Enhancing Image Generation Models Using Photogenic Needles in a Haystack},
    year          = {2023},
    eprint        = {2309.15807},
    archivePrefix = {arXiv},
    url           = {https://arxiv.org/abs/2309.15807},
  }

RoPAWS: Robust Semi-supervised Representation Learning from Uncurated Data
Sangwoo Mo, Jong-Chyi Su, Chih-Yao Ma, Mido Assran, Ishan Misra, Licheng Yu, Sean Bell
International Conference on Learning Representations (ICLR), 2023
[arXiv] / [GitHub] / [bibtex]


  @inproceedings{mo2023ropaws,
    author    = {Mo, Sangwoo and Su, Jong-Chyi and Ma, Chih-Yao and Assran, Mido and Misra, Ishan and Yu, Licheng and Bell, Sean},
    title     = {{RoPAWS}: Robust Semi-supervised Representation Learning from Uncurated Data},
    booktitle = {Proceedings of the International Conference on Learning Representations (ICLR)},
    year      = {2023},
  }

Trainable Projected Gradient Method for Robust Fine-tuning
Junjiao Tian, Zecheng He, Xiaoliang Dai, Chih-Yao Ma, Yen-Cheng Liu, Zsolt Kira
Computer Vision and Pattern Recognition (CVPR), 2023
[arXiv] / [GitHub] / [bibtex]


  @inproceedings{tian2023trainable,
    author    = {Tian, Junjiao and He, Zecheng and Dai, Xiaoliang and Ma, Chih-Yao and Liu, Yen-Cheng and Kira, Zsolt},
    title     = {Trainable Projected Gradient Method for Robust Fine-tuning},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    year      = {2023},
    pages     = {7836--7845},
  }

Structure-Encoding Auxiliary Tasks for Improved Visual Representation in Vision-and-Language Navigation
Chia-Wen Kuo, Chih-Yao Ma, Judy Hoffman, Zsolt Kira
Winter Conference on Applications of Computer Vision (WACV), 2023
[arXiv] / [Project] / [bibtex]


  @inproceedings{kuo2023structure,
    author    = {Kuo, Chia-Wen and Ma, Chih-Yao and Hoffman, Judy and Kira, Zsolt},
    title     = {Structure-Encoding Auxiliary Tasks for Improved Visual Representation in Vision-and-Language Navigation},
    booktitle = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision},
    year      = {2023},
    pages     = {1104--1113},
  }

Polyhistor: Parameter-Efficient Multi-Task Adaptation for Dense Vision Tasks
Yen-Cheng Liu, Chih-Yao Ma, Junjiao Tian, Zijian He, Zsolt Kira
Conference on Neural Information Processing Systems (NeurIPS), 2022
[arXiv] / [Project] / [GitHub] (coming soon) / [bibtex]


@inproceedings{liu2022polyhistor,
  author    = {Liu, Yen-Cheng and Ma, Chih-Yao and Tian, Junjiao and He, Zijian and Kira, Zsolt},
  title     = {{Polyhistor}: Parameter-Efficient Multi-Task Adaptation for Dense Vision Tasks},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2022},
}

Open-Set Semi-Supervised Object Detection
Yen-Cheng Liu, Chih-Yao Ma, Xiaoliang Dai, Junjiao Tian, Peter Vajda, Zijian He, Zsolt Kira
European Conference on Computer Vision (ECCV), 2022 (Oral)
[arXiv] / [Project] / [GitHub] (coming soon) / [bibtex]


@inproceedings{liu2022open,
  author    = {Liu, Yen-Cheng and Ma, Chih-Yao and Dai, Xiaoliang and Tian, Junjiao and Vajda, Peter and He, Zijian and Kira, Zsolt},
  title     = {Open-Set Semi-Supervised Object Detection},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  year      = {2022},
}

Cross-Domain Adaptive Teacher for Object Detection
Yu-Jhe Li, Xiaoliang Dai, Chih-Yao Ma, Yen-Cheng Liu, Kan Chen, Bichen Wu, Zijian He, Kris Kitani, Peter Vajda
Computer Vision and Pattern Recognition (CVPR), 2022
[PDF] / [GitHub] / [Project] / [bibtex]


@inproceedings{li2022cross,
  author    = {Li, Yu-Jhe and Dai, Xiaoliang and Ma, Chih-Yao and Liu, Yen-Cheng and Chen, Kan and Wu, Bichen and He, Zijian and Kitani, Kris and Vajda, Peter},
  title     = {Cross-Domain Adaptive Teacher for Object Detection},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2022},
}

Unbiased Teacher v2: Semi-supervised Object Detection for Anchor-free and Anchor-based Detectors
Yen-Cheng Liu, Chih-Yao Ma, Zsolt Kira
Computer Vision and Pattern Recognition (CVPR), 2022
[arXiv] / [PDF] / [GitHub] / [Project] / [bibtex]


@InProceedings{Liu_2022_CVPR,
    author    = {Liu, Yen-Cheng and Ma, Chih-Yao and Kira, Zsolt},
    title     = {Unbiased Teacher v2: Semi-Supervised Object Detection for Anchor-Free and Anchor-Based Detectors},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month     = jun,
    year      = {2022},
    pages     = {9819--9828}
}

Hierarchical Cross-Modal Agent for Robotics Vision-and-Language Navigation
Muhammad Zubair Irshad, Chih-Yao Ma, Zsolt Kira
IEEE International Conference on Robotics and Automation (ICRA), 2021
[arXiv] / [GitHub] / [Project] / [bibtex]


@inproceedings{irshad2021hierarchical,
  author    = {Irshad, Muhammad Zubair and Ma, Chih-Yao and Kira, Zsolt},
  title     = {Hierarchical Cross-Modal Agent for Robotics Vision-and-Language Navigation},
  booktitle = {Proceedings of the IEEE International Conference on Robotics and Automation (ICRA)},
  year      = {2021},
  url       = {https://arxiv.org/abs/2104.10674},
}

Unbiased Teacher for Semi-Supervised Object Detection
Yen-Cheng Liu, Chih-Yao Ma, Zijian He, Chia-Wen Kuo, Kan Chen, Peizhao Zhang, Bichen Wu, Zsolt Kira, Peter Vajda
International Conference on Learning Representations (ICLR), 2021
[arXiv] / [GitHub] / [Project] / [OpenReview] / [bibtex]


  @inproceedings{liu2021unbiased,
    author    = {Liu, Yen-Cheng and Ma, Chih-Yao and He, Zijian and Kuo, Chia-Wen and Chen, Kan and Zhang, Peizhao and Wu, Bichen and Kira, Zsolt and Vajda, Peter},
    title     = {Unbiased Teacher for Semi-Supervised Object Detection},
    booktitle = {Proceedings of the International Conference on Learning Representations (ICLR)},
    year      = {2021},
  }

Learning to Generate Grounded Visual Captions without Localization Supervision
Chih-Yao Ma, Yannis Kalantidis, Ghassan AlRegib, Peter Vajda, Marcus Rohrbach, Zsolt Kira
European Conference on Computer Vision (ECCV), 2020
[arXiv] / [GitHub] / [Project] / [ML@GT] / [bibtex]


  @inproceedings{ma2020learning,
    author    = {Ma, Chih-Yao and Kalantidis, Yannis and AlRegib, Ghassan and Vajda, Peter and Rohrbach, Marcus and Kira, Zsolt},
    title     = {Learning to Generate Grounded Visual Captions without Localization Supervision},
    booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
    year      = {2020},
    url       = {https://arxiv.org/abs/1906.00283},
  }

FeatMatch: Feature-Based Augmentation for Semi-Supervised Learning
Chia-Wen Kuo, Chih-Yao Ma, Jia-Bin Huang, Zsolt Kira
European Conference on Computer Vision (ECCV), 2020
[arXiv] / [Project] / [GitHub] / [bibtex]


  @inproceedings{kuo2020featmatch,
    author    = {Kuo, Chia-Wen and Ma, Chih-Yao and Huang, Jia-Bin and Kira, Zsolt},
    title     = {{FeatMatch}: Feature-Based Augmentation for Semi-Supervised Learning},
    booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
    year      = {2020},
    url       = {https://arxiv.org/abs/2007.08505},
  }

Who2com: Collaborative Perception via Learnable Handshake Communication
Yen-Cheng Liu, Junjiao Tian, Chih-Yao Ma, Nathaniel Glaser, Chia-Wen Kuo, Zsolt Kira
International Conference on Robotics and Automation (ICRA), 2020
[arXiv] / [GitHub] / [Project] / [bibtex]


  @inproceedings{liu2020who2com,
    author    = {Liu, Yen-Cheng and Tian, Junjiao and Ma, Chih-Yao and Glaser, Nathan and Kuo, Chia-Wen and Kira, Zsolt},
    title     = {{Who2com}: Collaborative Perception via Learnable Handshake Communication},
    booktitle = {Proceedings of the IEEE International Conference on Robotics and Automation (ICRA)},
    year      = {2020},
    url       = {https://arxiv.org/abs/2003.09575},
  }

Manifold Graph with Learned Prototypes for Semi-Supervised Image Classification
Chia-Wen Kuo, Chih-Yao Ma, Jia-Bin Huang, Zsolt Kira
Technical Report, 2019
[arXiv] / [Project] / [bibtex]


  @misc{kuo2019manifold,
    author        = {Kuo, Chia-Wen and Ma, Chih-Yao and Huang, Jia-Bin and Kira, Zsolt},
    title         = {Manifold Graph with Learned Prototypes for Semi-Supervised Image Classification},
    year          = {2019},
    eprint        = {1906.05202},
    archivePrefix = {arXiv},
    url           = {https://arxiv.org/abs/1906.05202},
  }

The Regretful Agent: Heuristic-Aided Navigation through Progress Estimation
Chih-Yao Ma, Zuxuan Wu, Ghassan AlRegib, Caiming Xiong, Zsolt Kira
Computer Vision and Pattern Recognition (CVPR), 2019 (Oral)
[arXiv] / [GitHub] / [Project] / [Poster] / [bibtex]


@inproceedings{ma2019theregretful,
  author    = {Ma, Chih-Yao and Wu, Zuxuan and AlRegib, Ghassan and Xiong, Caiming and Kira, Zsolt},
  title     = {The Regretful Agent: Heuristic-Aided Navigation through Progress Estimation},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2019},
  url       = {https://arxiv.org/abs/1903.01602},
}

AdaFrame: Adaptive Frame Selection for Fast Video Recognition
Zuxuan Wu, Caiming Xiong, Chih-Yao Ma, Richard Socher, Larry S Davis
Computer Vision and Pattern Recognition (CVPR), 2019

[arXiv] / [Poster] / [bibtex]


@inproceedings{wu2019adaframe,
  author    = {Wu, Zuxuan and Xiong, Caiming and Ma, Chih-Yao and Socher, Richard and Davis, Larry S.},
  title     = {{AdaFrame}: Adaptive Frame Selection for Fast Video Recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2019},
  url       = {https://arxiv.org/abs/1811.12432},
}

Self-Monitoring Navigation Agent via Auxiliary Progress Estimation
Chih-Yao Ma, Jiasen Lu, Zuxuan Wu, Ghassan AlRegib, Zsolt Kira, Richard Socher, Caiming Xiong
International Conference on Learning Representations (ICLR), 2019
(Top 7% of reviews)

[arXiv] / [OpenReview] / [GitHub] / [Project] / [Poster] / [ML@GT] / [bibtex]


@inproceedings{ma2019selfmonitoring,
  author    = {Ma, Chih-Yao and Lu, Jiasen and Wu, Zuxuan and AlRegib, Ghassan and Kira, Zsolt and Socher, Richard and Xiong, Caiming},
  title     = {Self-Monitoring Navigation Agent via Auxiliary Progress Estimation},
  booktitle = {Proceedings of the International Conference on Learning Representations (ICLR)},
  year      = {2019},
  url       = {https://arxiv.org/abs/1901.03035},
}

Attend and Interact: Higher-Order Object Interactions for Video Understanding
Chih-Yao Ma, Asim Kadav, Iain Melvin, Zsolt Kira, Ghassan AlRegib, Hans Peter Graf
Computer Vision and Pattern Recognition (CVPR), 2018

[arXiv] / [Project] / [Poster] / [ML@GT] / [bibtex]


@inproceedings{ma2018attend,
  author    = {Ma, Chih-Yao and Kadav, Asim and Melvin, Iain and Kira, Zsolt and AlRegib, Ghassan and Graf, Hans Peter},
  title     = {Attend and Interact: Higher-Order Object Interactions for Video Understanding},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2018},
}

TS-LSTM and temporal-inception: Exploiting spatiotemporal dynamics for activity recognition
Chih-Yao Ma^*, Min-Hung Chen^*, Zsolt Kira, and Ghassan AlRegib
Signal Processing: Image Communication, 2018
(^*: equal contribution)

[arXiv] / [GitHub] / [Project] / [bibtex]


@article{ma2019ts,
  author    = {Ma, Chih-Yao and Chen, Min-Hung and Kira, Zsolt and AlRegib, Ghassan},
  title     = {{TS-LSTM} and temporal-inception: Exploiting spatiotemporal dynamics for activity recognition},
  journal   = {Signal Processing: Image Communication},
  volume    = {71},
  pages     = {76--87},
  year      = {2019},
  publisher = {Elsevier},
}

Grounded Objects and Interactions for Video Captioning
Chih-Yao Ma, Asim Kadav, Iain Melvin, Zsolt Kira, Ghassan AlRegib, Hans Peter Graf
Neural Information Processing Systems (NeurIPS) Workshop on Visually-Grounded Interaction and Language, 2017

[arXiv] / [bibtex]


@misc{ma2017grounded,
  author        = {Ma, Chih-Yao and Kadav, Asim and Melvin, Iain and Kira, Zsolt and AlRegib, Ghassan and Graf, Hans Peter},
  title         = {Grounded Objects and Interactions for Video Captioning},
  year          = {2017},
  eprint        = {1711.06354},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/1711.06354},
}

Learning-based saliency model with depth information
Chih-Yao Ma and Hsueh-Ming Hang
Journal of Vision, 2015

[Paper] / [bibtex]


@article{ma2015learning,
  author    = {Ma, Chih-Yao and Hang, Hsueh-Ming},
  title     = {Learning-based saliency model with depth information},
  journal   = {Journal of Vision},
  volume    = {15},
  number    = {6},
  pages     = {19},
  year      = {2015},
  publisher = {The Association for Research in Vision and Ophthalmology},
}

About Me

Kevin Chih-Yao Ma

Career

Microsoft AI

Meta

Meta

Meta

Meta

Salesforce Research

NEC-Labs Machine Learning

Georgia Tech

National Chiao Tung University

National Chiao Tung University

Selected Publications

Research Interest

Services