publications | Zhihao Zhang

2024

ICML

Accelerating retrieval-augmented language model serving with speculation

Zhihao Zhang, Alan Zhu, Lijie Yang, and 4 more authors

To appear at ICML 2024, 2024

% ICML 2024 conference paper: stored as a conference-paper entry with the venue
% in booktitle; the pending-publication status is kept in note.
@inproceedings{zhang2024accelerating,
  author    = {Zhang, Zhihao and Zhu, Alan and Yang, Lijie and Xu, Yihua and Li, Lanting and Phothilimthana, Phitchaya Mangpo and Jia, Zhihao},
  title     = {Accelerating retrieval-augmented language model serving with speculation},
  booktitle = {International Conference on Machine Learning},
  year      = {2024},
  note      = {To appear at ICML 2024},
}

NeurIPS

Communication Bounds for the Distributed Experts Problem

Zhihao Jia, Qi Pang, Trung Tran, and 3 more authors (in alphabetical order)

In The Thirty-eighth Annual Conference on Neural Information Processing Systems, 2024
ICLR

TidalDecode: Fast and Accurate LLM Decoding with Position Persistent Sparse Attention

Lijie Yang*, Zhihao Zhang*, Zhuofu Chen, and 2 more authors

2024

2023

ASPLOS

SpecInfer: Accelerating generative LLM serving with speculative inference and token tree verification

Xupeng Miao*, Gabriele Oliaro*, Zhihao Zhang*, and 7 more authors

To appear at ASPLOS 2024, 2023

arXiv Bib

% ASPLOS 2024 conference paper (listed under 2023 on this page, so year is
% left unchanged). The braces in the title stop sentence-casing styles from
% lowercasing SpecInfer and LLM. cofirst is a non-standard field that standard
% BibTeX styles ignore; presumably the site generator reads it -- value kept.
@inproceedings{miao2023specinfer,
  author    = {Miao, Xupeng and Oliaro, Gabriele and Zhang, Zhihao and Cheng, Xinhao and Wang, Zeyu and Wong, Rae Ying Yee and Chen, Zhuoming and Arfeen, Daiyaan and Abhyankar, Reyna and Jia, Zhihao},
  title     = {{SpecInfer}: Accelerating generative {LLM} serving with speculative inference and token tree verification},
  booktitle = {ACM International Conference on Architectural Support for Programming Languages and Operating Systems},
  year      = {2023},
  note      = {To appear at ASPLOS 2024},
  cofirst   = {4}
}

arXiv

Towards efficient generative large language model serving: A survey from algorithms to systems

Xupeng Miao, Gabriele Oliaro, Zhihao Zhang, and 4 more authors

arXiv preprint arXiv:2312.15234, 2023

Bib

% arXiv preprint: the identifier is stored in eprint/archiveprefix (plus a URL
% for styles that do not print eprint fields) rather than in a journal field.
@misc{miao2023towards,
  author        = {Miao, Xupeng and Oliaro, Gabriele and Zhang, Zhihao and Cheng, Xinhao and Jin, Hongyi and Chen, Tianqi and Jia, Zhihao},
  title         = {Towards efficient generative large language model serving: A survey from algorithms to systems},
  year          = {2023},
  eprint        = {2312.15234},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2312.15234},
}

2021

ICLR

GradSign: Model Performance Inference with Theoretical Insights

Zhihao Zhang and Zhihao Jia

In International Conference on Learning Representations, 2021

Bib PDF

% ICLR conference paper. GradSign is braced so that sentence-casing styles
% keep its internal capital instead of rendering "Gradsign".
@inproceedings{zhang2021gradsign,
  author    = {Zhang, Zhihao and Jia, Zhihao},
  title     = {{GradSign}: Model Performance Inference with Theoretical Insights},
  booktitle = {International Conference on Learning Representations},
  year      = {2021},
}

TITS

Spatio-temporal graph dual-attention network for multi-agent prediction and tracking

Jiachen Li, Hengbo Ma, Zhihao Zhang, and 2 more authors

IEEE Transactions on Intelligent Transportation Systems, 2021

arXiv Bib

% Journal article in IEEE Transactions on Intelligent Transportation Systems
% (volume 23, issue 8).
@article{li2021spatio,
  author    = {Li, Jiachen and Ma, Hengbo and Zhang, Zhihao and Li, Jinning and Tomizuka, Masayoshi},
  title     = {Spatio-temporal graph dual-attention network for multi-agent prediction and tracking},
  journal   = {IEEE Transactions on Intelligent Transportation Systems},
  publisher = {IEEE},
  year      = {2021},
  volume    = {23},
  number    = {8},
  pages     = {10556--10569},
}

2020

arXiv

Social-WAGDAT: Interaction-aware trajectory prediction via Wasserstein graph double-attention network

Jiachen Li, Hengbo Ma, Zhihao Zhang, and 1 more author

arXiv preprint arXiv:2002.06241, 2020

Bib

% arXiv preprint: the identifier is stored in eprint/archiveprefix (plus a URL
% for styles that do not print eprint fields) rather than in a journal field.
% The acronym and the proper noun in the title are braced so that
% sentence-casing styles keep their capitals.
@misc{li2020social,
  author        = {Li, Jiachen and Ma, Hengbo and Zhang, Zhihao and Tomizuka, Masayoshi},
  title         = {{Social-WAGDAT}: Interaction-aware trajectory prediction via {Wasserstein} graph double-attention network},
  year          = {2020},
  eprint        = {2002.06241},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2002.06241},
}