From 6c3c10d3a52d1cf1bf81f6d5c93b2c24d0f06f35 Mon Sep 17 00:00:00 2001 From: Shiguang WU Date: Tue, 17 Sep 2024 21:49:54 +0800 Subject: [PATCH] update papers last week --- generate_readme.py | 13 +++++++++---- .../linear-attention/papers.csv | 3 ++- papers/mechanistic-engineering/papers.csv | 5 ++++- .../chain-of-thought/papers.csv | 3 ++- .../phenomena-of-interest/hallucination/papers.csv | 3 ++- papers/phenomena-of-interest/learning/papers.csv | 3 ++- .../training-dynamics/papers.csv | 3 ++- 7 files changed, 23 insertions(+), 10 deletions(-) diff --git a/generate_readme.py b/generate_readme.py index b3448ff..5046c42 100644 --- a/generate_readme.py +++ b/generate_readme.py @@ -1,3 +1,4 @@ +# encoding: utf-8 import os import csv import json @@ -25,7 +26,7 @@ def generate_table_of_content(category_info): header = "Table of Content\n====================\n\n" header += ( - "- [Awesome Transformers LM Analytics ](#awesome-transformers-lm-analytics-)\n" + "- [Awesome Language Model Analysis](#awesome-language-model-analysis-)\n" "- [Table of Content](#table-of-content)\n" ) footer = "\n" @@ -56,7 +57,7 @@ def gen_entry(name): def generate_section_template(category_info): - header_template = "## **{}**\n\n**[`^ back to top ^`](#awesome-transformers-lm-analytics-)**\n\n{}" + header_template = "## **{}**\n\n**[`^ back to top ^`](#awesome-language-model-analysis-)**\n\n{}" body_template = """
<summary>paper list (click to fold / unfold)</summary>
@@ -138,10 +139,14 @@ def get_section_list(topic):
     # read as dict, the first line is the header
-    with open(p, "r") as f:
+    with open(p, "r", encoding="utf-8") as f:
         reader = csv.DictReader(f)
         # sort by date
-        reader = sorted(reader, key=lambda x: x["Date"], reverse=True)
+        try:
+            reader = sorted(reader, key=lambda x: x["Date"], reverse=True)
+        except Exception as e:
+            print(f"Error reading {p}: {e}")
+            return [], []
         # sanity check of each row
         for row in reader:
             assert len(row.keys()) == 4, f"topic: {topic}, row: {row}"
diff --git a/papers/architectural-effectivity/linear-attention/papers.csv b/papers/architectural-effectivity/linear-attention/papers.csv
index e0a3bbe..26afa38 100644
--- a/papers/architectural-effectivity/linear-attention/papers.csv
+++ b/papers/architectural-effectivity/linear-attention/papers.csv
@@ -4,4 +4,5 @@ Transformers are SSMs: Generalized Models and Efficient Algorithms Through Struc
 Just read twice: closing the recall gap for recurrent language models,2024-07-07,http://arxiv.org/abs/2407.05483,Simran Arora; Aman Timalsina; Aaryan Singhal; Benjamin Spector; Sabri Eyuboglu; Xinyi Zhao; Ashish Rao; Atri Rudra; Christopher Ré
 Transformers to SSMs: Distilling Quadratic Knowledge to Subquadratic Models,2024-08-19,http://arxiv.org/abs/2408.10189,Aviv Bick; Kevin Y. Li; Eric P. Xing; J. Zico Kolter; Albert Gu
 Recurrent Neural Networks Learn to Store and Generate Sequences using Non-Linear Representations,2024-08-20,http://arxiv.org/abs/2408.10920,Róbert Csordás; Christopher Potts; Christopher D. Manning; Atticus Geiger
-"Theory, Analysis, and Best Practices for Sigmoid Self-Attention",2024-09-06,http://arxiv.org/abs/2409.04431,Jason Ramapuram; Federico Danieli; Eeshan Dhekane; Floris Weers; Dan Busbridge; Pierre Ablin; Tatiana Likhomanenko; Jagrit Digani; Zijin Gu; Amitis Shidani; Russ Webb
\ No newline at end of file
+"Theory, Analysis, and Best Practices for Sigmoid Self-Attention",2024-09-06,http://arxiv.org/abs/2409.04431,Jason Ramapuram; Federico Danieli; Eeshan Dhekane; Floris Weers; Dan Busbridge; Pierre Ablin; Tatiana Likhomanenko; Jagrit Digani; Zijin Gu; Amitis Shidani; Russ Webb
+"Autoregressive + Chain of Thought (CoT) ≃ Recurrent: Recurrence's Role in Language Models and a Revisit of Recurrent Transformer",2024-09-14,http://arxiv.org/abs/2409.09239,Xiang Zhang; Muhammad Abdul-Mageed; Laks V.S. Lakshmanan
\ No newline at end of file
diff --git a/papers/mechanistic-engineering/papers.csv b/papers/mechanistic-engineering/papers.csv
index db7f573..5f59526 100644
--- a/papers/mechanistic-engineering/papers.csv
+++ b/papers/mechanistic-engineering/papers.csv
@@ -38,4 +38,7 @@ The Mechanics of Conceptual Interpretation in GPT Models: Interpretative Insight
 A Mechanistic Interpretation of Syllogistic Reasoning in Auto-Regressive Language Models,2024-08-16,http://arxiv.org/abs/2408.08590,Geonhee Kim; Marco Valentino; André Freitas
 Transformer Circuit Faithfulness Metrics are not Robust,2024-07-11,http://arxiv.org/abs/2407.08734,Joseph Miller; Bilal Chughtai; William Saunders
 LLM Circuit Analyses Are Consistent Across Training and Scale,2024-07-15,http://arxiv.org/abs/2407.10827,Curt Tigges; Michael Hanna; Qinan Yu; Stella Biderman
-Modularity in Transformers: Investigating Neuron Separability & Specialization,2024-08-30,http://arxiv.org/abs/2408.17324,Nicholas Pochinkov; Thomas Jones; Mohammed Rashidur Rahman
\ No newline at end of file
+Modularity in Transformers: Investigating Neuron Separability & Specialization,2024-08-30,http://arxiv.org/abs/2408.17324,Nicholas Pochinkov; Thomas Jones; Mohammed Rashidur Rahman
+Extracting Paragraphs from LLM Token Activations,2024-09-10,http://arxiv.org/abs/2409.06328,Nicholas Pochinkov; Angelo Benoit; Lovkush Agarwal; Zainab Ali Majid; Lucile Ter-Minassian
+Explaining Datasets in Words: Statistical Models with Natural Language Parameters,2024-09-13,http://arxiv.org/abs/2409.08466,Ruiqi Zhong; Heng Wang; Dan Klein; Jacob Steinhardt
+Optimal ablation for interpretability,2024-09-16,http://arxiv.org/abs/2409.09951,Maximilian Li; Lucas Janson
\ No newline at end of file
diff --git a/papers/phenomena-of-interest/chain-of-thought/papers.csv b/papers/phenomena-of-interest/chain-of-thought/papers.csv
index c2ad6c4..9ff3a01 100644
--- a/papers/phenomena-of-interest/chain-of-thought/papers.csv
+++ b/papers/phenomena-of-interest/chain-of-thought/papers.csv
@@ -8,4 +8,5 @@ The Expressive Power of Transformers with Chain of Thought,2023-10-13,https://op
 Iteration Head: A Mechanistic Study of Chain-of-Thought,2024-06-04,http://arxiv.org/abs/2406.02128,Vivien Cabannes; Charles Arnal; Wassim Bouaziz; Alice Yang; Francois Charton; Julia Kempe
 On the Representational Capacity of Neural Language Models with Chain-of-Thought Reasoning,2024-06-20,http://arxiv.org/abs/2406.14197,Franz Nowak; Anej Svete; Alexandra Butoi; Ryan Cotterell
 Unveiling the Statistical Foundations of Chain-of-Thought Prompting Methods,2024-08-25,http://arxiv.org/abs/2408.14511,Xinyang Hu; Fengzhuo Zhang; Siyu Chen; Zhuoran Yang
-"Deciphering the Factors Influencing the Efficacy of Chain-of-Thought: Probability, Memorization, and Noisy Reasoning",2024-07-01,http://arxiv.org/abs/2407.01687,Akshara Prabhakar; Thomas L. Griffiths; R. Thomas McCoy
\ No newline at end of file
+"Deciphering the Factors Influencing the Efficacy of Chain-of-Thought: Probability, Memorization, and Noisy Reasoning",2024-07-01,http://arxiv.org/abs/2407.01687,Akshara Prabhakar; Thomas L. Griffiths; R. Thomas McCoy
+"Autoregressive + Chain of Thought (CoT) ≃ Recurrent: Recurrence's Role in Language Models and a Revisit of Recurrent Transformer",2024-09-14,http://arxiv.org/abs/2409.09239,Xiang Zhang; Muhammad Abdul-Mageed; Laks V.S. Lakshmanan
\ No newline at end of file
diff --git a/papers/phenomena-of-interest/hallucination/papers.csv b/papers/phenomena-of-interest/hallucination/papers.csv
index 7ecc3f5..0385f22 100644
--- a/papers/phenomena-of-interest/hallucination/papers.csv
+++ b/papers/phenomena-of-interest/hallucination/papers.csv
@@ -6,4 +6,5 @@ Calibrated Language Models Must Hallucinate,2023-11-24,http://arxiv.org/abs/2311
 The Curious Case of Hallucinatory Unanswerablity: Finding Truths in the Hidden States of Over-Confident Large Language Models,2023-10-18,http://arxiv.org/abs/2310.11877,Aviv Slobodkin; Omer Goldman; Avi Caciularu; Ido Dagan; Shauli Ravfogel
 Does Fine-Tuning LLMs on New Knowledge Encourage Hallucinations?,2024-05-09,http://arxiv.org/abs/2405.05904,Zorik Gekhman; Gal Yona; Roee Aharoni; Matan Eyal; Amir Feder; Roi Reichart; Jonathan Herzig
 Estimating the Hallucination Rate of Generative AI,2024-06-11,http://arxiv.org/abs/2406.07457,Andrew Jesson; Nicolas Beltran-Velez; Quentin Chu; Sweta Karlekar; Jannik Kossen; Yarin Gal; John P. Cunningham; David Blei
-Shared Imagination: LLMs Hallucinate Alike,2024-07-23,http://arxiv.org/abs/2407.16604,Yilun Zhou; Caiming Xiong; Silvio Savarese; Chien-Sheng Wu
\ No newline at end of file
+Shared Imagination: LLMs Hallucinate Alike,2024-07-23,http://arxiv.org/abs/2407.16604,Yilun Zhou; Caiming Xiong; Silvio Savarese; Chien-Sheng Wu
+"LLMs Will Always Hallucinate, and We Need to Live With This",2024-09-09,http://arxiv.org/abs/2409.05746,Sourav Banerjee; Ayushi Agarwal; Saloni Singla
\ No newline at end of file
diff --git a/papers/phenomena-of-interest/learning/papers.csv b/papers/phenomena-of-interest/learning/papers.csv
index af34d08..197abed 100644
--- a/papers/phenomena-of-interest/learning/papers.csv
+++ b/papers/phenomena-of-interest/learning/papers.csv
@@ -37,4 +37,5 @@ On the Generalization of Preference Learning with DPO,2024-08-06,http://arxiv.or
 Reasoning in Large Language Models: A Geometric Perspective,2024-07-02,http://arxiv.org/abs/2407.02678,Romain Cosentino; Sarath Shekkizhar
 Unforgettable Generalization in Language Models,2024-09-03,http://arxiv.org/abs/2409.02228,Eric Zhang; Leshem Chosen; Jacob Andreas
 The Many Faces of Optimal Weak-to-Strong Learning,2024-08-30,http://arxiv.org/abs/2408.17148,Mikael Møller Høgsgaard; Kasper Green Larsen; Markus Engelund Mathiasen
-On the Empirical Complexity of Reasoning and Planning in LLMs,2024-04-17,http://arxiv.org/abs/2404.11041,Liwei Kang; Zirui Zhao; David Hsu; Wee Sun Lee
\ No newline at end of file
+On the Empirical Complexity of Reasoning and Planning in LLMs,2024-04-17,http://arxiv.org/abs/2404.11041,Liwei Kang; Zirui Zhao; David Hsu; Wee Sun Lee
+Understanding Simplicity Bias towards Compositional Mappings via Learning Dynamics,2024-09-15,http://arxiv.org/abs/2409.09626,Yi Ren; Danica J. Sutherland
\ No newline at end of file
diff --git a/papers/phenomena-of-interest/training-dynamics/papers.csv b/papers/phenomena-of-interest/training-dynamics/papers.csv
index 09edd91..263ed1b 100644
--- a/papers/phenomena-of-interest/training-dynamics/papers.csv
+++ b/papers/phenomena-of-interest/training-dynamics/papers.csv
@@ -29,4 +29,5 @@ Learning Dynamics of LLM Finetuning,2024-07-15,http://arxiv.org/abs/2407.10490,Y
 Parameter-Efficient Fine-Tuning for Continual Learning: A Neural Tangent Kernel Perspective,2024-07-24,http://arxiv.org/abs/2407.17120,Jingren Liu; Zhong Ji; YunLong Yu; Jiale Cao; Yanwei Pang; Jungong Han; Xuelong Li
 Global Convergence in Training Large-Scale Transformers,2024-08,https://klusowski.princeton.edu/sites/g/files/toruqf5901/files/documents/gao2024global.pdf,Cheng Gao; Yuan Cao; Zihao Li; Yihan He; Mengdi Wang; Han Liu; Jason M. Klusowski; Jianqing Fan
 On the Convergence of Encoder-only Shallow Transformers,2024-08,https://proceedings.neurips.cc/paper_files/paper/2023/file/a3cf318fbeec1126da21e9185ae9908c-Paper-Conference.pdf,Yongtao Wu; Fanghui Liu; Grigorios G Chrysos; Volkan Cevher
-"The AdEMAMix Optimizer: Better, Faster, Older",2024-09-05,http://arxiv.org/abs/2409.03137,Matteo Pagliardini; Pierre Ablin; David Grangier
\ No newline at end of file
+"The AdEMAMix Optimizer: Better, Faster, Older",2024-09-05,http://arxiv.org/abs/2409.03137,Matteo Pagliardini; Pierre Ablin; David Grangier
+Optimization Hyper-parameter Laws for Large Language Models,2024-09-07,http://arxiv.org/abs/2409.04777,Xingyu Xie; Kuangyu Ding; Shuicheng Yan; Kim-Chuan Toh; Tianwen Wei
\ No newline at end of file
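
Note on the generate_readme.py change: the new try/except guards only the date sort, so one malformed papers.csv row is reported and the topic skipped instead of crashing README generation, and encoding="utf-8" matters because several entries contain non-ASCII characters (e.g. '≃' in a title, 'Ré' in an author name). Below is a minimal standalone sketch of the same loading pattern. The file path and the Title/Authors column names are illustrative assumptions (only the "Date" key is confirmed by the script's sort), and the sketch narrows the patch's broad `except Exception` to the two errors a malformed row actually raises.

# csv_check.py -- standalone sketch of the hardened loading in get_section_list()
# Assumptions: papers.csv has a header row with Title, Date, and Authors columns
# (only "Date" is confirmed by generate_readme.py); the path below is illustrative.
import csv

def load_rows_sorted_by_date(path):
    # utf-8 is required: titles and author names include non-ASCII characters
    with open(path, "r", encoding="utf-8") as f:
        rows = list(csv.DictReader(f))
    try:
        # ISO dates (YYYY-MM-DD) sort correctly as plain strings, newest first
        rows.sort(key=lambda row: row["Date"], reverse=True)
    except (KeyError, TypeError) as e:
        # a malformed row (e.g. an unquoted comma shifting fields) leaves
        # row["Date"] missing or None, which makes the string comparison raise
        print(f"Error reading {path}: {e}")
        return []
    return rows

if __name__ == "__main__":
    for row in load_rows_sorted_by_date("papers/phenomena-of-interest/hallucination/papers.csv"):
        print(row["Date"], row["Title"])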