reference. Why transformers need adam: A hessian perspective [zhang2024transformers]
% Title is stored in Title Case; {Adam} and {Hessian} are brace-protected so
% that styles applying sentence casing keep their capital letters.
@article{zhang2024transformers,
  author  = {Zhang, Yushun and Chen, Congliang and Ding, Tian and Li, Ziniu and Sun, Ruoyu and Luo, Zhiquan},
  title   = {Why Transformers Need {Adam}: A {Hessian} Perspective},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {37},
  pages   = {131786--131823},
  year    = {2024},
}