import json
import os
import random

# Fixed seed so that template selection and per-class sampling are repeatable.
random.seed(42)

# Directory that receives one JSONL file per category.
output_base_dir = 'internlm/dataset'
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(output_base_dir, exist_ok=True)
input_templates = [ "Based on the title '{title}', authors '{authors}', and abstract '{abstract}', please determine the scientific category of this paper.",
"Classification Request: Given the title '{title}', authored by '{authors}', and abstract '{abstract}', identify the research field of this paper.",
"Field Determination: Analyze the title '{title}', authors '{authors}', and abstract '{abstract}' to assign a discipline category.",
"Academic Categorization: Based on '{title}' (authors: '{authors}') and abstract content '{abstract}', classify this paper into a scientific domain.",
"Domain Assignment: Using the title '{title}', author list {authors}, and abstract text '{abstract}', determine the most relevant academic field.",
"Research Area Identification: From the paper titled '{title}' (by {authors}) and abstract '{abstract}', infer its primary research area.",
"Paper Taxonomy: Categorize the paper with title '{title}', authors {authors}, and abstract '{abstract}' into a specific scientific discipline.",
"Subject Labeling: With the metadata: Title '{title}', Authors {authors}, Abstract '{abstract}', generate a subject classification.",
"Knowledge Domain Inference: Based on '{title}' (by {authors}) and abstract '{abstract}', predict the broad field of study.",
"Scientific Field Prediction: Analyze the title '{title}', authors {authors}, and abstract '{abstract}' to output a single discipline label.",
"Multi-Metadata Classification: Integrate the paper\'s title '{title}', author affiliations '{authors}', and abstract '{abstract}' to assign a research category.", "分类请求:根据标题“{title}”、作者“{authors}”和摘要“{abstract}”,请确定该论文的研究领域。",
"领域判定:结合标题“{title}”、作者{authors}及摘要内容“{abstract}”,判断此论文所属学科类别。",
"学术分类:基于论文标题“{title}”(作者:{authors})和摘要“{abstract}”,将其划分到具体的科学领域。",
"学科标注:根据标题“{title}”、作者列表{authors}和摘要文本“{abstract}”,确定最相关的学术领域。",
"研究方向识别:从标题为“{title}”(作者{authors})及摘要“{abstract}”中推断其主要研究方向。",
"文献归类:将标题“{title}”、作者{authors}、摘要“{abstract}”的论文归类至特定学科门类。",
"主题分类:根据元数据:标题“{title}”、作者{authors}、摘要“{abstract}”,生成一个学科分类标签。",
"知识领域推断:基于标题“{title}”(作者{authors})及摘要“{abstract}”,预测其所属广泛研究领域。",
"科学领域预测:分析标题“{title}”、作者{authors}和摘要“{abstract}”,输出单一学科标签。",
"多维度分类:综合论文标题“{title}”、作者信息{authors}和摘要“{abstract}”,划分研究类别。",
"Label the research domain of this paper by analyzing:\nTitle: {title}\nAuthors: {authors}\nKey findings: {abstract}'", "Q: Which academic field does the paper '{title}' by {authors} belong to, given its abstract: '{abstract}'?\nA: The field is:", "This paper [{title}] authored by {authors} primarily focuses on ______ (fill in the field), as evidenced by the abstract: '{abstract}'.", "Reviewer Task: Based on the title '{title}', author affiliations {authors}, and abstract summary '{abstract}', assign a discipline category from the taxonomy codes.", "If the paper '{title}' by {authors} were a book in a library, which section would it shelve in? Abstract clues: '{abstract}'.",
"Step 1: Extract keywords from '{title}' and abstract: '{abstract}'.\nStep 2: Cross-reference with author '{authors}' expertise.\nStep 3: Output the dominant field.",
"Compare these metadata to classify the paper:\nTitle focus: {title}\nAuthor expertise: {authors}\nAbstract emphasis: {abstract}\nConclusion: The paper belongs to _____ field.",
"Can you accurately categorize {title} by {authors} just from this abstract? Prove it: '{abstract}'.",
"Inputs:\nMetadata: Title={title}, Authors={authors}\nContent: Abstract={abstract}\nProcessing: Apply field codes.\nOutput: Field=?",
"The DNA of this paper ({title} by {authors}) reveals its academic species. Abstract strand: '{abstract}'. Species identification:",
"Research Area Identification: From the paper titled '{title}' (by {authors}) and abstract '{abstract}', infer its primary research area.",
"Paper Taxonomy: Categorize the paper with title '{title}', authors {authors}, and abstract '{abstract}' into a specific scientific discipline.",
"Subject Labeling: With the metadata: Title '{title}', Authors {authors}, Abstract '{abstract}', generate a subject classification.",
"Knowledge Domain Inference: Based on '{title}' (by {authors}) and abstract '{abstract}', predict the broad field of study.",
"Discipline Prediction: Analyze the abstract '{abstract}' of the paper '{title}' authored by {authors} and suggest the academic domain.",
"Field Classification Task: Use the title '{title}', authors {authors}, and abstract '{abstract}' to assign a research category.",
"Scientific Area Determination: Given the information — Title: '{title}', Authors: {authors}, Abstract: '{abstract}' — identify the scientific domain.",
"Area Tagging: From the context of the paper '{title}' and its abstract '{abstract}', assign a field label.",
"Disciplinary Mapping: With the title '{title}', the author(s) {authors}, and the abstract '{abstract}', map this paper to a discipline.",
"Research Field Suggestion: Based on the content in the title '{title}' and abstract '{abstract}', recommend the research field.",
"Topic Classification: Classify the following paper by title '{title}', authors {authors}, and abstract '{abstract}'.",
"Academic Field Categorization: Given the title '{title}' and abstract '{abstract}', determine which academic field this paper falls into.",
"Scientific Discipline Inference: Determine the scientific discipline of the paper titled '{title}' (authors: {authors}) based on the abstract '{abstract}'.",
"Field Assignment Task: Use the provided paper metadata to assign the appropriate research area. Title: '{title}', Authors: {authors}, Abstract: '{abstract}'.",
"Content-Based Field Classification: Determine the field of study using the paper's title '{title}', authors {authors}, and abstract '{abstract}'.",
"Scholarly Classification Prompt: Use the paper title '{title}', author list {authors}, and abstract '{abstract}' to classify the research area.",
"Discipline Deduction: From the title '{title}', author list {authors}, and abstract '{abstract}', deduce the primary academic discipline.",
"Study Area Determination: Determine the core area of study of the paper titled '{title}' authored by {authors} from the abstract '{abstract}'.",
"Category Prediction Task: Predict the research category using the paper title '{title}' and abstract '{abstract}'.",
"Field Analysis Instruction: Based on metadata (title: '{title}', authors: {authors}, abstract: '{abstract}'), identify the study field.",
"**分类请求:**根据标题“{title}”、作者“{authors}”和摘要“{abstract}”,请确定该论文的研究领域。",
"**领域判定:**结合标题“{title}”、作者{authors}及摘要内容“{abstract}”,判断此论文所属学科类别。",
"**学术分类:**基于论文标题“{title}”(作者:{authors})和摘要“{abstract}”,将其划分到具体的科学领域。",
"**主题标签生成:**请依据论文的标题“{title}”、作者“{authors}”及摘要“{abstract}”,为其生成对应的学科标签。",
"**领域识别任务:**请根据以下论文信息(标题:“{title}”,作者:{authors},摘要:“{abstract}”)识别其研究领域。",
"**学科归类请求:**请将题为“{title}”、作者为{authors}的论文,基于摘要“{abstract}”进行学科归类。",
"**研究领域预测:**请根据论文摘要“{abstract}”内容,预测标题为“{title}”的论文的研究领域。",
"**论文领域自动识别:**输入信息包括标题“{title}”、作者{authors}、摘要“{abstract}”,请自动判断其学科领域。",
"**学术方向分类任务:**请根据以下论文元数据,判断其研究方向。标题:{title},作者:{authors},摘要:{abstract}。",
"**科学领域分类:**根据论文题目“{title}”和作者“{authors}”、摘要“{abstract}”,将其归类到相应的科学领域。",
"**领域推理任务:**利用标题“{title}”、作者“{authors}”及摘要“{abstract}”对论文进行研究方向推理。",
"**领域划分:**请根据“{title}”和“{abstract}”信息,作者为“{authors}”,判断其归属的学术领域。",
"**分类辅助:**请依据标题“{title}”和作者{authors}的摘要“{abstract}”内容,推荐一个合适的研究分类。",
"**领域归属分析:**根据论文内容判断其属于哪个研究领域。信息如下:标题:{title};作者:{authors};摘要:{abstract}。",
"**学科方向识别:**请根据摘要“{abstract}”和标题“{title}”,作者是“{authors}”,识别该论文的学科方向。",
"**论文归类任务:**依据论文元数据“{title}”、“{authors}”、“{abstract}”,请将其归类为某一学科类别。",
"hich academic field does this paper belong to? Based on its title '{title}', authors {authors}, and abstract '{abstract}', determine the most suitable classification.",
"Assign a scientific category to the paper below, using its metadata: Title: '{title}', Authors: {authors}, Abstract: '{abstract}'.",
"Summarize the domain of study that best fits the research described in '{title}' by {authors}. Consider the abstract: '{abstract}'.",
"Field estimation challenge: Based on the content of this scholarly work (Title: '{title}', by {authors}. Abstract: '{abstract}'), which field is it most aligned with?",
"Discipline tagging assistant: Help identify the most relevant field for the paper titled '{title}' by {authors}, summarized as: '{abstract}'.",
"Knowledge scope detection: Use the following metadata to detect the academic scope: Title - '{title}'; Authors - {authors}; Abstract - '{abstract}'.",
"Contextual paper classification: Examine the title and abstract provided, and place the research in an appropriate scientific taxonomy.",
"Suggest a domain label for the paper titled '{title}' with abstract '{abstract}'. Focus on broad scientific or technical fields.",
"Research domain detection: This paper (title: '{title}'; abstract: '{abstract}') was written by {authors}. What is its academic category?",
"Infer the scholarly classification from the semantic cues in the abstract '{abstract}', title '{title}', and authorship {authors}.",
"**请问这篇论文属于哪个研究领域?**以下是其基本信息:标题“{title}”,作者{authors},摘要“{abstract}”。", "**基于内容的领域分类:**请分析论文标题“{title}”和摘要“{abstract}”,判断其所属的科学门类。", "请对以下论文信息进行分类,包括标题“{title}”、作者{authors}和摘要“{abstract}”。", "**根据语义内容判断类别:**请从摘要“{abstract}”和标题“{title}”中提取关键信息,为论文分配一个学术领域。", "**帮我标注该论文的研究方向:**信息如下:{title},作者:{authors},摘要内容:“{abstract}”。", "**该研究更偏向哪个学科?**结合论文标题与摘要信息,请给出一个合理的分类建议。", "**从专业角度判断:**基于论文“{title}”与其研究摘要“{abstract}”,其应属于哪个专业领域?", "**请推荐一个学术标签,**用于表示这篇由{authors}撰写、标题为“{title}”的论文所属领域。", "**摘要分析分类:**请从该摘要“{abstract}”推测研究方向,并结合论文标题“{title}”做出归属判断。",
"**内容归类任务提示:**请使用该论文的元数据({title}、{authors}、{abstract})对其进行领域标签的生成。",
"Classify this paper into a research field. Title: '{title}', Authors: ({authors}), Abstract: '{abstract}'.",
"Given: title '{title}', authors '{authors}', abstract '{abstract}'. Determine the academic domain.",
"Use the abstract to assign a research category. Title: '{title}', Authors: '{authors}', Abstract: '{abstract}'.",
"Input: '{title}' by '{authors}'. Abstract: '{abstract}'. Output: scientific field.",
"From the title and abstract, categorize this paper. Title: '{title}'. Abstract: '{abstract}', Authors: ({authors}).",
"Can you help me figure out what field this paper belongs to? Here's the info: title '{title}', authors {authors}, abstract '{abstract}'.",
"I\'m trying to organize some papers. What category should this one go into? Title: '{title}', Authors: {authors}, Abstract: '{abstract}'.",
"I read this paper, but I'm unsure about its domain. Can you classify it? Title: '{title}', Abstract: '{abstract}', Authors: '{authors}'.",
"Which research area would you assign to this work based on its abstract and title? Title: '{title}', Authors: '{authors}', Abstract: '{abstract}'.",
"You are an academic journal editor. Based on the title '{title}', authors {authors}, and abstract '{abstract}', assign this paper to a suitable discipline.",
"As a librarian building a research taxonomy, determine the subject area for the paper: '{title}' by {authors} and abstract: '{abstract}'.",
"Act as a scientific reviewer. Categorize this manuscript by domain using: Title: '{title}', Abstract: '{abstract}', Author List: '{authors}'.",
"From the abstract '{abstract}' and title '{title}', (authors {authors}), what can you infer about the research domain of the paper?",
"What clues in the abstract '{abstract}' and title '{title}', (authors {authors}) suggest the field of study?",
"Analyze the keywords and topics in '{abstract}' and classify accordingly. And title '{title}', (authors {authors}).",
"[System] Input received. Paper Title: '{title}', Authors: {authors}, Abstract: '{abstract}'. Proceed to classify by domain.",
"[AI_Tagger] Please assign subject label based on: Title = '{title}', Abstract = '{abstract}', Author List: '{authors}'.",
"[MetadataAnalyzer] Classify this entry using embedded text: '{abstract}' (title: '{title}'), (authors {authors}).",
"Title = '{title}', Abstract = '{abstract}', Author List: '{authors}'. This paper was submitted for classification. Use the metadata to determine the category.",
"Title = '{title}', Abstract = '{abstract}', Author List: '{authors}'. Generate a domain label based on the core ideas from the abstract and title provided.",
"请根据标题“{title}”、作者{authors}和摘要“{abstract}”,对该论文进行学科分类。",
"任务:对以下论文分类。标题:{title};摘要:{abstract}; 作者列表“{authors}”。",
"输入元信息:标题“{title}”,摘要“{abstract}”,作者列表“{authors}”。输出:研究领域。",
"Title = '{title}' Author List: '{authors}', Abstract = '{abstract}',. 分类需求:根据论文摘要和标题内容,为其指定一个研究类别。",
"给出以下论文信息,请判断所属学科门类。 Title = '{title}', Abstract = '{abstract}', Author List: '{authors}'.",
"请问这篇文章属于哪个领域?标题是“{title}”,摘要如下:“{abstract}”。作者列表“{authors}”。",
"我正在整理文献,不确定这篇论文的研究方向。你能帮我分类吗?信息如下。标题是“{title}”,摘要如下:“{abstract}”。作者列表“{authors}”。",
"根据摘要“{abstract}”的内容,这篇题为“{title}” (作者列表“{authors}”)的论文应该归属哪个研究领域?",
"我不太确定这篇文章的学科归属,可以请你判断一下吗?标题是“{title}”,摘要如下:“{abstract}”。作者列表“{authors}”。",
"你是一位资深学术期刊编辑,请根据标题“{title}”、作者{authors}、摘要“{abstract}”为其确定研究方向。",
"作为图书馆分类员,你需要为这篇论文分配一个学科分类。标题“{title}”、作者{authors}、摘要“{abstract}”。",
"请模拟审稿人角色,为该论文选择一个最合适的研究领域。标题“{title}”、作者{authors}、摘要“{abstract}”。",
"标题“{title}”、作者{authors}、摘要“{abstract}”。 请模拟审稿人角色,为该论文选择一个最合适的研究领域。",
"从摘要“{abstract}”中的关键词判断,该论文属于哪一类学科?额外的信息:标题“{title}”、作者{authors}。",
"从研究目标和方法出发,请为该论文做出领域归属判断。标题“{title}”、作者{authors}、摘要“{abstract}”。",
"通过标题“{title}”及其对应的研究内容“{abstract}”,推断其最可能的研究方向。作者列表:“{authors}”。",
"[系统请求] 输入论文元信息:标题“{title}”、作者{authors}、摘要“{abstract}”。请进行自动分类。",
"[分类助手] 请为该论文分配一个领域标签。标题“{title}”、作者{authors}、摘要“{abstract}”。",
"[AI 分类引擎] 任务输入:{title},摘要:{abstract}。请输出所属学科。作者列表:“{authors}”。",
"如果你只读了以下论文摘要“{abstract}”和标题“{title}”,(作者列表你可能不关心:“{authors}”)你会认为它属于哪个领域?",
"假设你是一个“论文归类机器人”,你的任务是为这篇论文打上一个准确的学科标签。标题“{title}”、作者{authors}、摘要“{abstract}”。",
"[System Instruction] Paper classification task initiated. Input: title '{title}', authors {authors}, abstract '{abstract}'. Please assign an appropriate research domain label.",
"[MetadataClassifier::Invoke] -> Analyze the paper with metadata {title}, {authors}, and {abstract}. Output: scientific discipline.",
"[Task: ResearchFieldDetection] Paper metadata received. Begin classification using the abstract and title.\n> Title: '{title}'\n> Authors: {authors}\n> Abstract: '{abstract}'",
"[CLASSIFY_PAPER] Inputs:\n- TITLE = '{title}'\n- AUTHORS = {authors}\n- ABSTRACT = '{abstract}'\n→ RETURN: FIELD_LABEL",
"[System Input] A new research paper has been submitted. Please determine the academic category based on:\n• Title: '{title}'\n• Authors: {authors}\n• Abstract: '{abstract}'",
"【系统指令】已接收到论文元数据。请根据标题“{title}”、作者{authors}和摘要“{abstract}”,判定所属学科领域。",
"【研究领域分类模块】接收到一篇新论文,请根据摘要与标题内容进行自动归类。\n→ 论文信息:{title},{authors},{abstract}",
"[调用接口:学科分类] 参数如下:标题:{title}作者:{authors}摘要:{abstract}→ 返回值:学术领域标签"
]
# Answer sheet appended to every prompt: one "<letter>. <arXiv category>" per line.
options = "\n".join((
    "A. quant-ph",
    "B. physics.chem-ph",
    "C. physics.atom-ph",
    "D. cond-mat.soft",
    "E. cs.RO",
    "F. cs.CL",
    "G. cs.SE",
    "H. cs.IR",
    "I. hep-th",
    "J. hep-ph",
    "K. physics.optics",
    "L. cs.AI",
    "M. cs.CV",
    "N. nucl-th",
    "O. astro-ph",
    "P. math.PR",
    "Q. cs.OS",
    "R. eess.SP",
    "S. math.OC",
    "T. math.DS",
    "U. math.DG",
    "V. math.MP",
    "W. cs.MM",
    "X. stat.ME",
    "Y. math.CO",
    "Z. cs.NE",
))
# System-prompt variants; one is drawn at random for every generated sample.
# Changes from the previous text: removed the invalid "\ " escape after "99%",
# rejoined a literal that had been split across two physical lines, and fixed
# the typos "ralted subfileds" and "Arixv".
instruction_templates = [
    "You are an AI academic librarian trained to classify research papers with 99% accuracy.",
    "[SYSTEM ROLE] Domain Classification Officer\n Mission: Categorize the paper",
    "你是个优秀的论文分类师",
    "As a meta-reviewer AI, you must:\n1. Identify 4 key terms from title of the paper\n2. Cross-check with authors publication history\n3. Map abstract to the most related subfields",
    "By academic protocol GPT-2025, you are required to\n1. Disclose uncertainty if abstract is ambiguous\n2. Prioritize author-specified keywords in title\n3. Identify the most related subfields",
    "Task: Teach a graduate student how to classify title.\nSteps:\na) Highlight disciplinary cues in abstract\nb) Explain why authors affiliations suggest _____ field\nc) Conclude with the option [A-Z Arxiv field code]",
    "[AI CLASSIFIER v3.1 INPUT]\nTitle: title\nAuthors: authors\nAbstract: abstract\nPROCESSING...\nOUTPUT: [A-Z Arxiv code]",
    "As an ethical AI classifier, you MUST:\nAvoid overgeneralization (e.g., 'Engineering' is too broad)\nCite classification rationale from abstract\nExample output: [Arxiv field code]",
    "[URGENT PEER REVIEW REQUEST] Deadline: 10s to classify title (authors) for conference track assignment. Abstract snapshot: abstract. Respond ONLY with track option from provided list.",
    "你是一名学术档案管理员,需根据《Arxiv图书馆分类法》根据题目、作者和摘要内容对论文进行精准分类。并输出Arxiv分类代码",
    "[系统指令] 国家自然科学基金委AI评审员 任务:依据标题、作者及摘要,从申请代码A-Z中选择最匹配的子领域",
    "作为学术审计AI,你必须:\n① 从摘要提取方法论关键词\n② 核对authors在Scopus的研究主题\n③ 对照 A-Z 的《学科分类与代码》\n最终输出分类代码:",
    "根据《AI科研分类规范》2024版:\n标题中的'研究'/'分析'等词不得作为分类依据\n需明确摘要中的3处领域特征\n输出包含 A-Z 的分类代码",
    "假设你是一个“论文归类机器人”,你的任务是为这篇论文打上一个准确的学科标签。",
    "[分类助手] 请为该论文分配一个领域标签。",
    "从研究目标和方法出发,请为该论文做出领域归属判断。",
    "从摘要和题目中的关键词判断,该论文属于哪一类学科?",
    "作为图书馆分类员,你需要从摘要和题目中的关键词判断为这篇论文分配一个学科分类。",
    "请模拟审稿人角色,为该论文选择一个最合适的研究领域,可以从摘要和题目进行判断。",
    "作为论文资深读者,你可以通过论文元信息判断所属学科门类。",
    "This paper was submitted for classification. Use the metadata to determine the category.",
    "Generate a domain label based on the core ideas from the abstract and title provided.",
]
# Answer letter -> arXiv category, in the same order as the printed answer sheet.
option_map = dict(
    A="quant-ph",
    B="physics.chem-ph",
    C="physics.atom-ph",
    D="cond-mat.soft",
    E="cs.RO",
    F="cs.CL",
    G="cs.SE",
    H="cs.IR",
    I="hep-th",
    J="hep-ph",
    K="physics.optics",
    L="cs.AI",
    M="cs.CV",
    N="nucl-th",
    O="astro-ph",
    P="math.PR",
    Q="cs.OS",
    R="eess.SP",
    S="math.OC",
    T="math.DS",
    U="math.DG",
    V="math.MP",
    W="cs.MM",
    X="stat.ME",
    Y="math.CO",
    Z="cs.NE",
)

# Reverse lookup: arXiv category -> answer letter.
get_options = {field: letter for letter, field in option_map.items()}

# For each category, the set of every *other* category in the table.
other_option_map = {
    field: set(option_map.values()) - {field}
    for field in get_options
}
def preprocess_arxiv_json(input_jsonl_file, output_jsonl_file):
    """Split an arXiv metadata dump into one SFT file per target category.

    Every non-blank input line is parsed as a JSON object. A record is kept
    for a category when that category name occurs in the record's
    ``categories`` value and none of the other 25 category names does
    (``in`` tests; ``categories`` is treated as a string, so these are
    substring matches). Each kept record becomes an
    ``{"instruction", "input", "output"}`` sample built from randomly chosen
    templates, with the answer sheet appended to the input and the answer
    letter as the output.

    Samples are written to ``<output_base_dir>/<category>.jsonl`` and the
    per-category count is printed.

    Args:
        input_jsonl_file (str): Path to the input JSONL file.
        output_jsonl_file (str): Not used by this function; kept so existing
            callers keep working.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    papers = {category: [] for category in option_map.values()}

    with open(input_jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # A blank line would make json.loads raise.
                continue
            item = json.loads(line)
            title = item.get('title', '')
            authors: str = item.get('authors', '')
            abstract: str = item.get('abstract', '')
            categories: str = item.get('categories', '')

            for category, samples in papers.items():
                if category not in categories:
                    continue
                if any(c in categories for c in other_option_map[category]):
                    # Also mentions another target category: ambiguous label.
                    continue
                instruction = random.choice(instruction_templates)
                # json.dumps quotes and escapes each field before it is
                # substituted into the template.
                input_text = random.choice(input_templates).format(
                    title=json.dumps(title),
                    authors=json.dumps(authors),
                    abstract=json.dumps(abstract),
                )
                input_text = input_text + '\n\n' + options
                samples.append({
                    "instruction": instruction,
                    "input": input_text,
                    "output": get_options[category],
                })
                break

    for category, samples in papers.items():
        output_file = os.path.join(output_base_dir, f"{category}.jsonl")
        with open(output_file, 'w', encoding='utf-8') as out_f:
            # One buffered write stream; no flush per record.
            out_f.writelines(
                json.dumps(sample, ensure_ascii=False) + '\n'
                for sample in samples
            )
        print(f'{category}: {len(samples)}')
def fix_category(input_jsonl_file, output_jsonl_file, category, repeat_to=0,
                 judge_rule=lambda x, y: x.startswith(y), open_mode='w',
                 exclude_multi=True):
    """Build (or top up) the SFT file for a single category.

    Records are selected from the input dump with ``judge_rule`` and rendered
    into ``{"instruction", "input", "output"}`` samples. The rendering pass is
    repeated, drawing fresh random templates each time, until the output
    holds at least ``repeat_to`` lines. The final count is printed.

    Args:
        input_jsonl_file (str): Path to the input JSONL file.
        output_jsonl_file (str): Path of the JSONL file to write.
        category (str): Category to collect. ``'math-ph'`` is accepted and
            answered with the ``math.MP`` option.
        repeat_to (int): Minimum number of lines wanted in the output. A pass
            always writes every selected record, so the result may overshoot.
        judge_rule (callable): ``judge_rule(categories, category)`` returns
            true when a record belongs to ``category``. The default checks
            that the record's ``categories`` value starts with ``category``.
        open_mode (str): Mode for the first write. Any value other than
            ``'w'`` makes existing lines in the output count toward
            ``repeat_to``.
        exclude_multi (bool): Skip records whose ``categories`` value also
            contains any other category from the option table.

    Raises:
        KeyError: If ``category`` has no entry in the option table.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    # The option table has no "math-ph" key; that name is answered as "math.MP".
    label = 'math.MP' if category == 'math-ph' else category
    answer = get_options[label]
    rivals = other_option_map[label] if exclude_multi else ()

    cnt = 0
    if open_mode != 'w':
        try:
            with open(output_jsonl_file, 'r', encoding='utf-8') as out_f:
                cnt = sum(1 for _ in out_f)
        except FileNotFoundError:
            # Nothing written yet; the first write creates the file.
            cnt = 0

    # Parse the dump once; only the random template choice differs per pass.
    matched = []
    with open(input_jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            item = json.loads(line)
            categories: str = item.get('categories', '')
            if any(c in categories for c in rivals):
                continue
            if judge_rule(categories, category):
                matched.append((
                    item.get('title', ''),
                    item.get('authors', ''),
                    item.get('abstract', ''),
                ))

    mode = open_mode
    while True:
        with open(output_jsonl_file, mode, encoding='utf-8') as out_f:
            for title, authors, abstract in matched:
                instruction = random.choice(instruction_templates)
                input_text = random.choice(input_templates).format(
                    title=json.dumps(title),
                    authors=json.dumps(authors),
                    abstract=json.dumps(abstract),
                )
                input_text = input_text + '\n\n' + options
                out_f.write(json.dumps(
                    {"instruction": instruction, "input": input_text, "output": answer},
                    ensure_ascii=False,
                ))
                out_f.write('\n')
        cnt += len(matched)
        # Later passes must append; reopening with 'w' would discard the
        # lines that cnt has already counted.
        mode = 'a'
        # With no matching records another pass can never reach repeat_to.
        if not matched or cnt >= repeat_to:
            break

    print(f'after fix, {category}: {cnt}')
def cnt_in_filename(basedir: str):
    """Count lines of the JSONL files in *basedir* and fix doubled extensions.

    Every directory entry whose name ends with ``jsonl`` is read as UTF-8 and
    its lines are counted. A name containing ``.jsonl.jsonl`` is renamed so
    that the doubled extension becomes a single ``.jsonl``.

    Args:
        basedir (str): Directory to scan. Files are read from this directory,
            not from the module-level output directory.

    Returns:
        dict: Final file name -> number of lines in that file.
    """
    counts = {}
    for fname in sorted(os.listdir(basedir)):
        if not fname.endswith('jsonl'):
            continue
        path = os.path.join(basedir, fname)
        # The dataset files are written as UTF-8, so read them the same way
        # instead of relying on the platform default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            num = sum(1 for _ in f)
        new_name = fname.replace('.jsonl.jsonl', '.jsonl')
        if new_name != fname:
            os.rename(path, os.path.join(basedir, new_name))
        counts[new_name] = num
    return counts
def gather(basedir: str, sample_num_class, output_file='arxiv_20k_rich.jsonl'):
    """Merge a random sample of every per-category file into one JSONL file.

    Every entry in *basedir* whose name ends with ``jsonl`` is read as UTF-8.
    Up to ``sample_num_class`` of its lines are drawn with ``random.sample``
    and written to ``output_file``. The per-file and total sample counts are
    printed.

    Args:
        basedir (str): Directory holding the per-category JSONL files. Files
            are read from this directory, not from the module-level output
            directory.
        sample_num_class (int): Maximum number of lines taken from each file.
        output_file (str): Path of the merged file; it is overwritten.

    Returns:
        int: Total number of lines written.
    """
    cnt = 0
    merged_path = os.path.abspath(output_file)
    with open(output_file, 'w', encoding='utf-8') as out_f:
        # Sorted so that, with a fixed seed, each file gets the same sample
        # on every run; os.listdir returns entries in arbitrary order.
        for fname in sorted(os.listdir(basedir)):
            if not fname.endswith('jsonl'):
                continue
            path = os.path.join(basedir, fname)
            if os.path.abspath(path) == merged_path:
                # Never feed the merged file back into itself.
                continue
            with open(path, 'r', encoding='utf-8') as f:
                data = f.readlines()
            data = random.sample(data, min(sample_num_class, len(data)))
            # A file's last line may lack its newline; add it so that
            # records never run together in the merged output.
            out_f.writelines(
                line if line.endswith('\n') else line + '\n' for line in data
            )
            print(f'{fname}: {len(data)}')
            cnt += len(data)
    print(f'total {cnt}')
    return cnt
if __name__ == "__main__":
    # Report how many system-prompt and user-prompt templates are available.
    print(f""" 系统提示词模板数量:{len(instruction_templates)} 用户提示词模板数量:{len(input_templates)} """)

    # Input dump and intermediate output path. Nothing in this entry point
    # reads them; they are kept for the preprocessing steps defined above.
    arxiv_json_file = 'd:/data/arxiv-metadata-oai-snapshot.json'
    output_json_file = './arxiv_sftdata.jsonl'

    # Per-category sample counts recorded from an earlier preprocessing run:
    #   quant-ph: 75119
    #   physics.chem-ph: 5999
    #   physics.atom-ph: 6848
    #   cond-mat.soft: 14530
    #   cs.RO: 15943
    #   cs.CL: 32125
    #   cs.SE: 10743
    #   cs.IR: 5137
    #   hep-th: 60558
    #   hep-ph: 83572
    #   physics.optics: 17736
    #   cs.AI: 12987
    #   cs.CV: 74045
    #   nucl-th: 19846
    #   astro-ph: 86911
    #   math.PR: 25289
    #   cs.OS: 347    issue: scarce. cs.OS only (347); cs.OS contains (565);
    #                 new primary (1060); total (2174)
    #   eess.SP: 11193
    #   math.OC: 19764
    #   math.DS: 14277
    #   math.DG: 17389
    #   math.MP: 0    issue: listed under the alias math-ph; many cross-listed
    #                 topics
    #   cs.MM: 1087   issue: many cross-listed topics (cs.CV, cs.AI); consider
    #                 picking as few pre-2018 papers as possible to avoid
    #                 cross-listing; primary (32848)
    #   stat.ME: 12315
    #   math.CO: 32513
    #   cs.NE: 2904   issue: few samples; many cross-listed topics (cs.CV, cs.AI)

    # Output files of the three categories flagged above (answer letters Q, V, W).
    output_Q_json_file = os.path.join(output_base_dir, "cs.OS.jsonl")
    output_V_json_file = os.path.join(output_base_dir, "math.MP.jsonl")
    output_W_json_file = os.path.join(output_base_dir, "cs.MM.jsonl")

    # Merge up to 20000 samples per category into the final dataset file.
    gather(basedir=output_base_dir, sample_num_class=20000)