pyvcf是一个用于处理VCF(Variant Call Format)文件的 Python 库。VCF文件是一种存储基因变异信息的文本格式,pyvcf库提供了便捷的接口来读取、写入和操作VCF文件中的数据。
1. 安装pyvcf
pip install pyvcf
2. 使用pyvcf库读取VCF文件
import vcf
# 定义读取的vcf文件路径
input_vcf_path = 'sample.vcf'
# 读取VCF文件
vcf_reader = vcf.Reader(open(input_vcf_path, 'r'))
# 获取VCF头信息
vcf_metadata = vcf_reader.metadata
vcf_filters = vcf_reader.filters
vcf_formats = vcf_reader.formats
vcf_infos = vcf_reader.infos
# 获取样本列表
samples = vcf_reader.samples
print('*' * 15 + 'vcf_metadata' + '*' * 15)
for key, value in vcf_metadata.items():
print(f"{key}: {value}")
print('\n')
print('*' * 15 + 'vcf_filters' + '*' * 15)
for key, value in vcf_filters.items():
print(f"{key}: {value}")
print('\n')
print('*' * 15 + 'vcf_formats' + '*' * 15)
for key, value in vcf_formats.items():
print(f"{key}: {value}")
print('\n')
print('*' * 15 + 'vcf_infos' + '*' * 15)
for key, value in vcf_infos.items():
print(f"{key}: {value}")
print('\n')
print('*' * 15 + 'Samples' + '*' * 15)
for sample in samples:
print(sample)
print('\n')
# 遍历VCF记录
for record in vcf_reader:
# 获取基本信息
# 染色体
chrom = record.CHROM
# 位置
pos = record.POS
# 参考等位基因
ref = record.REF
# 替代等位基因
alt = record.ALT
# 质量值
qual = record.QUAL
# 过滤结果
filter_result = record.FILTER
# 获取depth信息
if 'DP' in record.INFO:
depth = record.INFO['DP']
# 获取样本信息
for sample in record.samples:
sample_name = sample.sample
# 获取当前位点样本的基因型
genotype = sample['GT']
if 'DP' in sample.data._fields:
# 获取样本当前位点的测序深度
sample_depth = sample['DP']
if 'GQ' in sample.data._fields:
# 获取样本当前位点的基因型质量值
genotype_quality = sample['GQ']
# 打印一行结果, break结束循环
print(f"chrom: {chrom}, pos: {pos}, ref: {ref}, alt: {alt}, qual: {qual}, "
f"filter_result: {filter_result}, depth: {depth}, sample_name: {sample_name}, "
f"genotype: {genotype}, sample_depth: {sample_depth}, genotype_quality: {genotype_quality}")
break
3. 使用pyvcf库写入变异信息至VCF文件
import vcf
# 写入的vcf文件路径
output_vcf_path = 'output.vcf'
# 读取VCF文件
vcf_reader = vcf.Reader(open(input_vcf_path, 'r'))
# 写入VCF文件
vcf_writer = vcf.Writer(open(output_vcf_path, 'w'), vcf_reader)
# 创建新的VCF记录
new_record = vcf.model._Record(
CHROM='chr1',
POS=10000,
ID='.',
REF='A',
ALT=[vcf.model._Substitution('G')],
QUAL=None,
FILTER=None,
INFO={},
FORMAT='GT',
sample_indexes=None,
samples=None
)
# 写入记录
vcf_writer.write_record(new_record)
# 关闭写入器
vcf_writer.close()
4. 过滤 VCF 文件
# 根据染色体编号过滤,下列为提取chr1染色体的VCF记录
filtered_records = []
for record in vcf_reader:
if record.CHROM == 'chr1':
filtered_records.append(record)
# 根据基因型质量GQ和测序深度DP过滤VCF记录
filtered_records = []
for record in vcf_reader:
# 遍历全部样本
for sample in record.samples:
if 'GQ' in sample and sample['GQ'] > 20 and 'DP' in sample and sample['DP'] > 10:
filtered_records.append(record)