reference. Why Transformers Need Adam: A Hessian Perspective [zhang2024transformers]
reference. Why Transformers Need Adam: A Hessian Perspective [zhang2024transformers]
@comment{Conference-proceedings paper: the proceedings name is stored in booktitle. Title is kept in Title Case; Adam and Hessian are brace-protected so sentence-casing styles preserve their capitals.}
@inproceedings{zhang2024transformers,
  author    = {Zhang, Yushun and Chen, Congliang and Ding, Tian and Li, Ziniu and Sun, Ruoyu and Luo, Zhiquan},
  title     = {Why Transformers Need {Adam}: A {Hessian} Perspective},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {37},
  pages     = {131786--131823},
  year      = {2024},
}