How to Create a Bioinformatics AI Agent Using Biopython for DNA and Protein Analysis

class BioPythonAIAgent:
    """Helper that stores sequences and runs analyses/plots on them.

    The instance keeps four dictionaries: ``sequences`` (id -> record),
    ``analysis_results`` (id -> metrics dict), ``alignments`` and ``trees``.

    The methods rely on names that must be importable at module level and
    are not defined in this class: ``Entrez``, ``SeqIO``, ``AlignIO``,
    ``Phylo``, ``Seq``, ``SeqRecord``, ``gc_fraction``, ``molecular_weight``,
    ``ProteinAnalysis``, ``ClustalwCommandline``, ``DistanceCalculator``,
    ``DistanceTreeConstructor``, ``make_subplots``, ``go``, ``px``, ``pd``,
    ``plt`` and ``os``.
    """

    # Letters treated as DNA when deciding whether nucleotide-only metrics
    # (GC content, DNA molecular weight, translation) apply to a sequence.
    # This is a heuristic: a peptide made only of A/C/G/T/N would match too.
    _DNA_ALPHABET = frozenset("ACGTN")

    def __init__(self, email="user@example.com"):
        """Create an empty agent and register ``email`` with ``Entrez``.

        Args:
            email: Contact address assigned to ``Entrez.email``. The default
                is only a placeholder (the address in the scraped original
                was redacted); callers should pass their own.
        """
        self.email = email
        Entrez.email = email
        self.sequences = {}
        self.analysis_results = {}
        self.alignments = {}
        self.trees = {}

    @classmethod
    def _is_dna(cls, seq):
        """Return True when ``seq`` is non-empty and uses only DNA letters."""
        letters = set(str(seq).upper())
        return bool(letters) and letters <= cls._DNA_ALPHABET

    def fetch_sequence_from_ncbi(self, accession_id, db="nucleotide", rettype="fasta"):
        """Download one record via ``Entrez.efetch`` and cache it.

        Args:
            accession_id: Identifier passed to ``efetch`` and used as cache key.
            db: Entrez database name.
            rettype: Entrez return type.

        Returns:
            The parsed record, or ``None`` if fetching/parsing failed (the
            error is printed).
        """
        try:
            handle = Entrez.efetch(db=db, id=accession_id, rettype=rettype, retmode="text")
            try:
                # NOTE(review): the response is always parsed as FASTA, so a
                # non-FASTA ``rettype`` will presumably fail here - confirm.
                record = SeqIO.read(handle, "fasta")
            finally:
                # Close the handle even when parsing raises.
                handle.close()
        except Exception as e:
            print(f"Error fetching sequence: {e}")
            return None

        self.sequences[accession_id] = record
        return record

    def create_sample_sequences(self):
        """Register three built-in example sequences in ``self.sequences``.

        Returns:
            A list of ``(sequence_id, sequence_string, description)`` tuples.
        """
        covid_spike = "MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT"

        human_insulin = "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN"

        e_coli_16s = "AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAGCAGCTTGCTGCTTTGCTGACGAGTGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAATGTCGCAAGACCAAAGAGGGGGACCTTCGGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTAGTAGGTGGGGTAACGGCTCACCTAGGCGACGATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGCGTTAAGGTTAATAACCTTGGCGATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTCTGTCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGGTGCGAAAGCGTGGGGAGCAAACA"

        sample_sequences = [
            ("COVID_Spike", covid_spike, "SARS-CoV-2 Spike Protein"),
            ("Human_Insulin", human_insulin, "Human Insulin Precursor"),
            ("E_coli_16S", e_coli_16s, "E. coli 16S rRNA"),
        ]

        for seq_id, seq_str, desc in sample_sequences:
            self.sequences[seq_id] = SeqRecord(Seq(seq_str), id=seq_id, description=desc)

        return sample_sequences

    def analyze_sequence(self, sequence_id=None, sequence=None):
        """Compute basic metrics for a stored or ad-hoc sequence.

        Args:
            sequence_id: Key into ``self.sequences``; takes precedence when
                it is present there.
            sequence: Raw sequence string, used when ``sequence_id`` does not
                select a stored record.

        Returns:
            A dict with ``'length'`` and ``'composition'`` (counts of A, T,
            G, C). For sequences that look like DNA it additionally holds
            ``'gc_content'``, ``'molecular_weight'`` and, when the length is
            a multiple of three and translation succeeds, ``'translation'``,
            ``'stop_codons'`` and protein metrics. ``None`` when neither
            argument yields a sequence. The dict is also stored in
            ``self.analysis_results`` under ``sequence_id`` or ``"custom"``.
        """
        if sequence_id and sequence_id in self.sequences:
            seq = self.sequences[sequence_id].seq
        elif sequence:
            seq = Seq(sequence)
        else:
            return None

        analysis = {
            'length': len(seq),
            'composition': {base: seq.count(base) for base in ('A', 'T', 'G', 'C')},
        }

        # The original test ("'A' in composition and 'T' in composition") was
        # always true, so proteins received DNA metrics. Check the alphabet.
        if self._is_dna(seq):
            analysis['gc_content'] = round(gc_fraction(seq) * 100, 2)
            try:
                analysis['molecular_weight'] = round(molecular_weight(seq, seq_type="DNA"), 2)
            except Exception:
                # Fallback estimate of 650 per residue, kept from the original.
                analysis['molecular_weight'] = len(seq) * 650

            # Translation is best-effort: any failure leaves the keys unset.
            try:
                if len(seq) % 3 == 0:
                    protein = seq.translate()
                    analysis['translation'] = str(protein)
                    analysis['stop_codons'] = protein.count('*')

                    # Drop the final residue (expected stop) and only analyse
                    # the protein if no internal stop remains.
                    trimmed = str(protein)[:-1]
                    if '*' not in trimmed:
                        prot_analysis = ProteinAnalysis(trimmed)
                        analysis['protein_mw'] = round(prot_analysis.molecular_weight(), 2)
                        analysis['isoelectric_point'] = round(prot_analysis.isoelectric_point(), 2)
                        analysis['protein_composition'] = prot_analysis.get_amino_acids_percent()
            except Exception:
                pass

        key = sequence_id if sequence_id else "custom"
        self.analysis_results[key] = analysis

        return analysis

    def visualize_composition(self, sequence_id):
        """Show a pie/bar/scatter figure for a previously analysed sequence.

        Does nothing when ``sequence_id`` has no entry in
        ``self.analysis_results``.
        """
        if sequence_id not in self.analysis_results:
            return

        analysis = self.analysis_results[sequence_id]

        fig = make_subplots(
            rows=2, cols=2,
            specs=[[{"type": "pie"}, {"type": "bar"}],
                   [{"colspan": 2}, None]],
            subplot_titles=("Nucleotide Composition", "Base Count", "Sequence Properties"),
        )

        labels = list(analysis['composition'].keys())
        values = list(analysis['composition'].values())

        fig.add_trace(
            go.Pie(labels=labels, values=values, name="Composition"),
            row=1, col=1,
        )

        fig.add_trace(
            go.Bar(x=labels, y=values, name="Count",
                   marker_color=['red', 'blue', 'green', 'orange']),
            row=1, col=2,
        )

        properties = ['Length', 'GC%', 'MW (kDa)']
        prop_values = [
            analysis['length'],
            analysis.get('gc_content', 0),
            analysis.get('molecular_weight', 0) / 1000,
        ]

        fig.add_trace(
            go.Scatter(x=properties, y=prop_values, mode="markers+lines",
                       marker=dict(size=10, color="purple"), name="Properties"),
            row=2, col=1,
        )

        fig.update_layout(
            title=f"Comprehensive Analysis: {sequence_id}",
            showlegend=False,
            height=600,
        )

        fig.show()

    def perform_multiple_sequence_alignment(self, sequence_ids):
        """Align every pair of the given stored sequences.

        Despite the name this performs pairwise alignments, one per unordered
        pair, keeping the first alignment returned for each pair.

        Returns:
            A list of alignments, or ``None`` when fewer than two of the ids
            are present in ``self.sequences``.
        """
        if len(sequence_ids) < 2:
            return None

        sequences = [self.sequences[seq_id] for seq_id in sequence_ids
                     if seq_id in self.sequences]

        if len(sequences) < 2:
            return None

        from itertools import combinations

        from Bio.Align import PairwiseAligner

        aligner = PairwiseAligner()
        aligner.match_score = 2
        aligner.mismatch_score = -1
        aligner.open_gap_score = -2
        aligner.extend_gap_score = -0.5

        return [aligner.align(first.seq, second.seq)[0]
                for first, second in combinations(sequences, 2)]

    def create_phylogenetic_tree(self, alignment_key=None, sequences=None):
        """Build a UPGMA tree from a stored alignment or raw sequences.

        Args:
            alignment_key: Key into ``self.alignments``; used when present.
            sequences: Iterable of sequence strings. They are written to
                ``temp.fasta`` in the working directory and aligned by
                running the ``clustalw2`` command.

        Returns:
            The tree (also stored in ``self.trees`` as ``tree_<n>``), or
            ``None`` when no input is usable or the ClustalW step fails.
        """
        if alignment_key and alignment_key in self.alignments:
            alignment = self.alignments[alignment_key]
        elif sequences:
            records = [SeqRecord(Seq(seq), id=f"seq_{i}")
                       for i, seq in enumerate(sequences)]
            try:
                SeqIO.write(records, "temp.fasta", "fasta")
                try:
                    clustalw_cline = ClustalwCommandline("clustalw2", infile="temp.fasta")
                    clustalw_cline()
                    alignment = AlignIO.read("temp.aln", "clustal")
                except Exception:
                    return None
            finally:
                # Remove the scratch files on failure as well as on success.
                for path in ("temp.fasta", "temp.aln", "temp.dnd"):
                    if os.path.exists(path):
                        os.remove(path)
        else:
            return None

        calculator = DistanceCalculator('identity')
        dm = calculator.get_distance(alignment)

        constructor = DistanceTreeConstructor()
        tree = constructor.upgma(dm)

        tree_key = f"tree_{len(self.trees)}"
        self.trees[tree_key] = tree

        return tree

    def visualize_tree(self, tree):
        """Draw ``tree`` with ``Phylo.draw`` on a 10x6 matplotlib figure."""
        fig, ax = plt.subplots(figsize=(10, 6))
        Phylo.draw(tree, axes=ax)
        plt.title("Phylogenetic Tree")
        plt.tight_layout()
        plt.show()

    def protein_structure_analysis(self, sequence_id):
        """Translate a stored sequence and compute protein properties.

        Returns:
            A dict of ``ProteinAnalysis`` results, or ``None`` when the id is
            unknown, the length is not a multiple of three, the translation
            contains an internal stop, or any step raises.
        """
        if sequence_id not in self.sequences:
            return None

        seq = self.sequences[sequence_id].seq

        try:
            if len(seq) % 3 == 0:
                protein = seq.translate()
                # Last residue is dropped before checking for internal stops.
                trimmed = str(protein)[:-1]
                if '*' not in trimmed:
                    prot_analysis = ProteinAnalysis(trimmed)

                    return {
                        'molecular_weight': prot_analysis.molecular_weight(),
                        'isoelectric_point': prot_analysis.isoelectric_point(),
                        'amino_acid_percent': prot_analysis.get_amino_acids_percent(),
                        'secondary_structure': prot_analysis.secondary_structure_fraction(),
                        'flexibility': prot_analysis.flexibility(),
                        'gravy': prot_analysis.gravy(),
                    }
        except Exception:
            # Best-effort: failures are reported as "no result".
            pass

        return None

    def comparative_analysis(self, sequence_ids):
        """Tabulate stored analysis results and plot them side by side.

        Only ids already present in ``self.analysis_results`` are included.
        A figure is shown when more than one row is available.

        Returns:
            A ``pandas.DataFrame`` with one row per included sequence.
        """
        results = []

        for seq_id in sequence_ids:
            if seq_id in self.analysis_results:
                analysis = self.analysis_results[seq_id].copy()
                analysis['sequence_id'] = seq_id
                results.append(analysis)

        df = pd.DataFrame(results)

        if len(df) > 1:
            fig = make_subplots(
                rows=2, cols=2,
                subplot_titles=("Length Comparison", "GC Content",
                                "Molecular Weight", "Composition Heatmap"),
            )

            fig.add_trace(
                go.Bar(x=df['sequence_id'], y=df['length'], name="Length"),
                row=1, col=1,
            )

            if 'gc_content' in df.columns:
                fig.add_trace(
                    go.Scatter(x=df['sequence_id'], y=df['gc_content'],
                               mode="markers+lines", name="GC%"),
                    row=1, col=2,
                )

            if 'molecular_weight' in df.columns:
                fig.add_trace(
                    go.Bar(x=df['sequence_id'], y=df['molecular_weight'], name="MW"),
                    row=2, col=1,
                )

            fig.update_layout(title="Comparative Sequence Analysis", height=600)
            fig.show()

        return df

    def codon_usage_analysis(self, sequence_id):
        """Count non-overlapping triplets of a stored sequence and plot the top 20.

        Returns:
            A DataFrame with columns ``Codon`` and ``Count`` sorted by count
            (descending), or ``None`` when the id is unknown or the sequence
            length is not a multiple of three.
        """
        if sequence_id not in self.sequences:
            return None

        seq = self.sequences[sequence_id].seq

        if len(seq) % 3 != 0:
            return None

        from collections import Counter

        codons = Counter(str(seq[i:i + 3]) for i in range(0, len(seq) - 2, 3))

        codon_df = pd.DataFrame(list(codons.items()), columns=['Codon', 'Count'])
        codon_df = codon_df.sort_values('Count', ascending=False)

        fig = px.bar(codon_df.head(20), x='Codon', y='Count',
                     title=f"Top 20 Codon Usage - {sequence_id}")
        fig.show()

        return codon_df

    def motif_search(self, sequence_id, motif_pattern):
        """Return every start index where ``motif_pattern`` occurs exactly.

        Overlapping occurrences are reported. An unknown ``sequence_id`` or
        an empty ``motif_pattern`` yields an empty list.
        """
        if sequence_id not in self.sequences or not motif_pattern:
            return []

        seq = str(self.sequences[sequence_id].seq)
        width = len(motif_pattern)

        return [i for i in range(len(seq) - width + 1)
                if seq[i:i + width] == motif_pattern]

    def gc_content_window(self, sequence_id, window_size=100):
        """Compute and plot GC percentage over sliding windows.

        Windows of ``window_size`` start every ``window_size // 4`` positions
        (at least 1); each value is reported at the window midpoint.

        Returns:
            ``(positions, gc_values)``, or ``None`` for an unknown id.

        Raises:
            ValueError: If ``window_size`` is smaller than 1.
        """
        if sequence_id not in self.sequences:
            return None

        if window_size < 1:
            raise ValueError("window_size must be at least 1")

        seq = self.sequences[sequence_id].seq
        # window_size // 4 is 0 for windows shorter than 4; range() rejects that.
        step = max(1, window_size // 4)
        gc_values = []
        positions = []

        for i in range(0, len(seq) - window_size + 1, step):
            window = seq[i:i + window_size]
            gc_values.append(gc_fraction(window) * 100)
            positions.append(i + window_size // 2)

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=positions, y=gc_values, mode="lines+markers",
                                 name=f'GC Content (window={window_size})'))
        fig.update_layout(
            title=f"GC Content Sliding Window Analysis - {sequence_id}",
            xaxis_title="Position",
            yaxis_title="GC Content (%)",
        )
        fig.show()

        return positions, gc_values

    def run_comprehensive_analysis(self, sequence_ids):
        """Run the per-sequence analyses and, for several ids, the comparison.

        Returns:
            A dict mapping each stored id to its ``'basic_analysis'``,
            ``'gc_window'`` and ``'codon_usage'`` results, plus a
            ``'comparative'`` DataFrame when more than one id was given.
        """
        results = {}

        for seq_id in sequence_ids:
            if seq_id in self.sequences:
                analysis = self.analyze_sequence(seq_id)
                self.visualize_composition(seq_id)

                gc_analysis = self.gc_content_window(seq_id)
                codon_analysis = self.codon_usage_analysis(seq_id)

                results[seq_id] = {
                    'basic_analysis': analysis,
                    'gc_window': gc_analysis,
                    'codon_usage': codon_analysis,
                }

        if len(sequence_ids) > 1:
            comparative_df = self.comparative_analysis(sequence_ids)
            results['comparative'] = comparative_df

        return results

How to Create a Bioinformatics AI Agent Using Biopython for DNA and Protein Analysis

Leave a Reply Cancel reply

Follow US

Popular News

United Arab Emirates leaving OPEC, effective May 1

Expert Reveals Why Stars Like George Clooney Have Left Hollywood For France

Microsoft hosts emergency press conference after protesters ‘storm a building’

'2+2=5' Leads Critics Choice Documentary Awards Nominations

The 2026 Time Series Toolkit: 5 Foundation Models for Autonomous Forecasting

Categories

About US

Quick Links

Important Links

Subscribe US