Last active
June 1, 2022 20:16
-
-
Save jaclynsaunders/82719d4124390d7a0ed4df87e9342f7b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
from Bio import SeqIO

# Rename every record in a FASTA file to "<RENAME_PATTERN><n>", where n counts
# from 1 in file order, and write a CSV lookup table that maps each original
# record ID to its new abbreviated ID.
#
# As written, this stores all sequences in memory. For very large files you
# will want to use a different Biopython parsing method.

FILENAME = "input_file_to_rename.fasta"
RENAME_PATTERN = "Project-name_ORF_"
OUT_FILENAME = "outfile_Project_ORF_renamed"

recordIx = SeqIO.parse(FILENAME, "fasta")

# Not used anywhere in this script; retained only so the module-level names
# stay the same as before.
resultsDict = {}
# Renamed records, in input order, to be written to the output FASTA file.
resultsList = []
# One [original ID, new ID] pair per record. Rows are collected in a plain
# list and turned into a DataFrame once after the loop, because enlarging a
# DataFrame one row at a time with .loc gets slower as the table grows.
lookupRows = []

# Iterate through the FASTA file and create the new names.
for i, record in enumerate(recordIx, start=1):
    newID = RENAME_PATTERN + str(i)
    lookupRows.append([record.id, newID])
    record.id = newID
    # Clear name and description so the written header carries only the new ID.
    record.name = ''
    record.description = ''
    resultsList.append(record)

df = pd.DataFrame(
    lookupRows,
    columns=["Original ORF Accession", "Abbreviated ORF Accession"],
)

# Write the new FASTA file with abbreviated names.
SeqIO.write(resultsList, OUT_FILENAME + ".fasta", "fasta")

# Write the lookup .csv table.
df.to_csv(OUT_FILENAME + "_lookup_table.csv", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment