1.最近帮医院医生处理原始MRI影像数据时,遇到了一些问题,写此文章简单记录下。2.医生的要求是把每个病人的MRI影像特定的序列(T2、DWI、DCE增强序列)的影像数据的特定区域分割出来;并按照序列由原来的dicom格式存储成NIFTI(.nii)格式。
从医院拷贝出来的原始数据存在以下几种问题
1)数据格式非常混乱
2)有些dicom文件无法读取
3)部分文件夹为空
4)部分dicom文件大小为0KB
5)掺杂其他模态的影像数据eg:钼靶影像数据
6)MRI文件夹名字无规则(命名方式不易进行后处理)
7)所有dicom文件混乱在一起没有按序列存储
面对这么“脏”的数据,拿到那一刻感觉要崩了,,,不过还好之前写过类似的python脚本,修改下可以用来处理。以下便是预处理阶段的python代码
1.删除原始数据中的空文件以及空文件夹
import os
def del_empty(path):
    """Delete zero-byte files and empty directories below ``path``.

    The tree is walked bottom-up, so a directory that becomes empty once its
    zero-byte files or empty sub-directories are gone is removed in the same
    pass.  ``path`` itself and its ancestors are never removed.

    Args:
        path: Root directory of the raw data to clean.
    """
    for root, dirs, files in os.walk(path, topdown=False):
        for item in files:
            file_path = os.path.join(root, item)
            try:
                if os.path.getsize(file_path) == 0:
                    os.remove(file_path)
                    print('成功删除文件' + file_path)
            except OSError as e:
                print('Exception', e)
        for item in dirs:
            dir_path = os.path.join(root, item)
            try:
                if os.listdir(dir_path):
                    continue  # still has content: keep it
                # os.rmdir removes exactly one directory; os.removedirs would
                # keep climbing and could delete ``path`` and its parents.
                os.rmdir(dir_path)
                print('成功删除文件夹' + dir_path)
            except OSError as e:
                print('Exception', e)
定义路径,调用函数执行
# Root folder of the raw data to clean, then run the clean-up on it.
path1 = r'G:\2020年10月需要去皮肤数据\1'
del_empty(path1)
2.删除包含MG(钼靶)的文件夹
最好把原始数据进行备份后再执行此步骤
import shutil
def delete_MG(folder, keyword="MG"):
    """Delete entries whose name contains ``keyword`` inside each study folder.

    Only the direct children of every sub-directory of ``folder`` are
    inspected.  Matching directories are removed recursively, matching files
    are removed individually.

    NOTE: the match is a plain substring test, so a name such as ``IMG001``
    also matches ``"MG"``.  Back up the data before running this.

    Args:
        folder: Directory that contains one sub-directory per study.
        keyword: Substring that marks an entry for deletion (default "MG",
            i.e. mammography).
    """
    for every_study in os.listdir(folder):
        study_dir = os.path.join(folder, every_study)
        if not os.path.isdir(study_dir):
            continue
        for i, f in enumerate(os.listdir(study_dir)):
            if keyword not in f:
                continue
            target = os.path.join(study_dir, f)
            if os.path.isdir(target):
                shutil.rmtree(target)
                print(i, f, target)
            else:
                os.remove(target)
# Remove the mammography (MG) entries from the copied data set.
delete_MG(r"G:\去皮肤数据Copy\2")
3.重命名病历号以及MRI文件夹名字
import glob
import SimpleITK as sitk #Note:注意SimpleITK不支持中文,即路径中不能有中文
import os
import pandas as pd
import pydicom
def rename_study(folderPath, prefix='00'):
    """Rename every entry directly inside ``folderPath`` by adding ``prefix``.

    An entry is skipped (and reported) when the prefixed name already exists,
    so an existing study is never replaced by the rename.

    Args:
        folderPath: Directory that contains one entry per study.
        prefix: Text put in front of every entry name (default '00').
    """
    count_study = 0
    for every_study in os.listdir(folderPath):
        source_path = os.path.join(folderPath, every_study)
        new_path = os.path.join(folderPath, prefix + every_study)
        if os.path.exists(new_path):
            # os.rename would raise on Windows and may silently replace the
            # target on POSIX; leave both entries untouched instead.
            print(new_path, 'already exists, skipped')
            continue
        print(source_path)
        print('..rename to.. ')
        print(new_path)
        os.rename(source_path, new_path)
        count_study += 1
    print('共有', count_study, '个study')
# Folder holding one sub-folder per study; prefix all of them.
path = r"G:\去皮肤数据Copy\2"
rename_study(path)
函数功能:遍历每个study下MR目录中的文件夹,读取其中一个dicom文件,并把该文件夹重命名为dicom中的检查日期(StudyDate)
def get_Dcm_info(folderPath):
    """Rename each folder under ``<study>/MR`` to the StudyDate of its DICOMs.

    For every study in ``folderPath`` the direct sub-folders of its ``MR``
    directory are visited.  One file of each sub-folder is read (header only)
    and the sub-folder is renamed to that file's ``StudyDate``.  Plain files,
    empty folders, files without a StudyDate and name collisions are reported
    and skipped instead of aborting the run.

    Args:
        folderPath: Directory that contains one sub-directory per study.
    """
    count_study = 0
    for every_study in os.listdir(folderPath):
        count_study += 1
        mr_children = glob.glob(os.path.join(folderPath, every_study, "MR", "*"))
        for every_last_path in mr_children:
            if not os.path.isdir(every_last_path):
                continue
            print('oldDir', every_last_path)
            all_dicoms = os.listdir(every_last_path)
            if not all_dicoms:
                print('skip', every_last_path, 'empty folder')
                continue
            # One file is enough to learn the study date; the pixel data is
            # not needed for that.
            dcm = pydicom.dcmread(
                os.path.join(every_last_path, all_dicoms[0]),
                stop_before_pixels=True,
            )
            StudyDate = str(getattr(dcm, 'StudyDate', '') or '')
            if not StudyDate:
                print('skip', every_last_path, 'no StudyDate')
                continue
            new_path = os.path.join(os.path.dirname(every_last_path), StudyDate)
            print('newDir', new_path)
            if new_path == every_last_path:
                continue  # already carries the right name
            if os.path.exists(new_path):
                print('skip', every_last_path, 'target already exists')
                continue
            os.rename(every_last_path, new_path)
    print('共有', count_study, '个study')
preprocessFolder =r'G:\去皮肤数据Copy\2' # root folder to process
get_Dcm_info(preprocessFolder)# run the rename
4.查看所有病例MRI都有哪些序列
import re
import time
import json
def get_dcmSeries_info(folderPath, saveJsonPath):
    """Collect the SeriesDescription values of every study and save them as JSON.

    For each study in ``folderPath`` every sub-folder of its ``MR`` directory
    is scanned and the distinct ``SeriesDescription`` values of its files are
    recorded.  The result has the shape
    ``[{study: [{mri_folder: [series names]}, ...]}, ...]``.

    When ``saveJsonPath`` already holds a JSON list, the new records are
    merged into it so the file stays a single valid JSON document across
    repeated calls.  A file that cannot be parsed is appended to instead of
    being overwritten.

    Args:
        folderPath: Directory that contains one sub-directory per study.
        saveJsonPath: Path of the JSON file to create or extend.
    """
    count_study = 0
    total_List = []
    for every_study in os.listdir(folderPath):
        mr_root = os.path.join(folderPath, every_study, "MR")
        if not os.path.isdir(mr_root):
            continue  # stray file or study without MR data
        count_study += 1
        series_per_folder = []
        for mri_name in os.listdir(mr_root):
            mri_dir = os.path.join(mr_root, mri_name)
            if not os.path.isdir(mri_dir):
                continue
            names = set()
            for file_name in os.listdir(mri_dir):
                # Only one header tag is needed, so skip the pixel data and
                # every other element; this is what makes the scan fast.
                dcm = pydicom.dcmread(
                    os.path.join(mri_dir, file_name),
                    stop_before_pixels=True,
                    specific_tags=["SeriesDescription"],
                )
                description = getattr(dcm, "SeriesDescription", None)
                if description is not None:
                    names.add(str(description))
            # Sorted so repeated runs produce identical files.
            series_per_folder.append({mri_name: sorted(names)})
        total_List.append({every_study: series_per_folder})

    previous = _load_json_list(saveJsonPath)
    if previous is None:
        # Existing content is not a JSON list: append rather than destroy it.
        with open(saveJsonPath, 'a') as f:
            json.dump(total_List, f, indent=4)
    else:
        with open(saveJsonPath, 'w') as f:
            json.dump(previous + total_List, f, indent=4)
    print('共有', count_study, '个study')


def _load_json_list(json_path):
    """Return the list stored in ``json_path``.

    Returns ``[]`` when the file is missing or empty, and ``None`` when its
    content is not a single JSON list.
    """
    if not os.path.isfile(json_path) or os.path.getsize(json_path) == 0:
        return []
    try:
        with open(json_path, encoding='utf-8') as f:
            data = json.load(f)
    except ValueError:
        return None
    return data if isinstance(data, list) else None
定义文件路径以及存储路径,并进行文件夹遍历
# Scan every sub-folder of MRfolder_path and record its series names in one
# JSON file, timing the whole run.
begin_time = time.time()
MRfolder_path = r'F:\预处理后需要去皮肤的数据'
saveJsonPath = "F:\\预处理后需要去皮肤的数据\\Series_result.json"
for item in os.listdir(MRfolder_path):
    every_path = os.path.join(MRfolder_path, item)
    if os.path.isdir(every_path):
        get_dcmSeries_info(every_path, saveJsonPath)
    else:
        # Report the entry that was skipped, not the folder being scanned
        # (the result JSON itself lives in this folder and ends up here).
        print(every_path, "is not dir")
end_time = time.time()
print('run code total needs ', end_time - begin_time, ' s')
程序执行过程如下:会发现需要的时间还是很长的,因为需要遍历每个MRI的每张dicom文件,如果哪位大佬有更好的方法欢迎批评指正。
执行完后会生成一个json文件,里面存放的是每个MRI文件夹下的所有序列名字,有这个我们可以知道此病人做了磁共振检查的什么序列。如下图所示:
5.对想要的DCE序列进行分类,并进行重命名
吐槽下:后来快做完了,才知道又要其他序列的(T2,DWI),,,,
不过思路都是差不多的,先找到对应序列的所有名字存在一个list中;然后遍历每一张dicom,存到对应以SeriesDescription命名的文件夹下
# SeriesDescription values that identify a DCE (dynamic contrast enhanced) series.
DCE_Series = ['t1_fl3d_tra_dyna_1+5','t1_fl3d_tra_dyna_1+5_NEW','fl3d_dynamic_1-pre_3-post','fl3d_dynamic_1-pre_6-post','t1_fl3d_tra_fs_1+5_p2']


def dicomSeriesClassifier(folderPath, savefolder, series_names=None, out_subdir="DCEMR"):
    """Copy the DICOM files of the wanted series into per-series folders.

    Every ``*.dcm`` file found in ``<folderPath>/<study>/MR/<mri_folder>/`` is
    inspected (header only).  When its ``SeriesDescription`` is one of the
    wanted names, the file is copied to
    ``<savefolder>/<study>/<out_subdir>/<mri_folder>/<series time>/``.

    The series time is the integer part of ``SeriesTime`` converted with
    ``int``, so leading zeros are dropped ("093015" becomes "93015").
    Consumers must therefore order these folders numerically, not as text.

    Args:
        folderPath: Directory that contains one sub-directory per study.
        savefolder: Root directory that receives the classified copies.
        series_names: SeriesDescription values to keep; defaults to
            ``DCE_Series``.
        out_subdir: Name of the folder created below each study in
            ``savefolder`` (default "DCEMR").
    """
    wanted = tuple(DCE_Series if series_names is None else series_names)
    count_study = 0
    for every_study in os.listdir(folderPath):
        count_study += 1
        study_save_root = os.path.join(savefolder, every_study, out_subdir)
        pattern = os.path.join(folderPath, every_study, "MR", "*", "*.dcm")
        for dcm_path in glob.glob(pattern):
            # Two header tags decide everything; skip the pixel data.
            dcm = pydicom.dcmread(
                dcm_path,
                stop_before_pixels=True,
                specific_tags=["SeriesDescription", "SeriesTime"],
            )
            if getattr(dcm, "SeriesDescription", None) not in wanted:
                continue
            # Drop the fractional seconds so one series maps to one folder.
            series_time = str(getattr(dcm, "SeriesTime", "")).split(".")[0]
            if not series_time.isdigit():
                print('skip', dcm_path, 'invalid SeriesTime')
                continue
            mri_folder = os.path.basename(os.path.dirname(dcm_path))
            target_dir = os.path.join(
                study_save_root, mri_folder, str(int(series_time))
            )
            os.makedirs(target_dir, exist_ok=True)
            shutil.copy(dcm_path, target_dir)
    print(count_study, "个病例MRI序列分类完成")
--------调用函数执行
# Source data set and the folder that receives the classified DCE copies.
folderPath1 = r"G:\去皮肤数据Copy\1"
savefolder = r'G:\去皮肤数据Copy\汇总分类1'
dicomSeriesClassifier(folderPath1,savefolder)
6.对每个DCE序列根据序列时间排序并进行重命名
(DCE序列一般有6个序列,是病人注射对比剂后不同时间下的成像,一般情况下恶性肿瘤的影像在不同时间信号强度也不一样)
def renameDCESeries(folderPath):
    """Rename the series folders of each exam to DCE00001, DCE00002, ...

    For every folder in ``<folderPath>/<study>/DCEMR/`` the sub-folders are
    ordered and renamed to ``DCE`` plus a five-digit running number.
    Sub-folders whose name is a number are ordered numerically, so "95900"
    comes before "100100"; any other names follow in text order.  A folder
    is left alone when the target name already exists.

    Args:
        folderPath: Directory that contains one sub-directory per study.
    """
    count_study = 0
    for every_study in os.listdir(folderPath):
        count_study += 1
        exam_dirs = glob.glob(os.path.join(folderPath, every_study, "DCEMR", "*"))
        for exam_dir in exam_dirs:
            if not os.path.isdir(exam_dir):
                continue
            phases = sorted(os.listdir(exam_dir), key=_series_time_key)
            for t, phase in enumerate(phases, start=1):
                oldDirName = os.path.join(exam_dir, phase)
                newDirName = os.path.join(exam_dir, "DCE%05d" % t)
                if not os.path.exists(newDirName):
                    os.rename(oldDirName, newDirName)
                else:
                    print(newDirName, "have alredy exists")
    print("已重命名", count_study, "个study")


def _series_time_key(name):
    """Sort key: numeric names by value first, then other names as text."""
    try:
        return (0, int(name), name)
    except ValueError:
        return (1, 0, name)
print("已重命名",count_study,"个study")
# Run the renaming on the classified data set.
renameDCESeries(r"G:\去皮肤数据Copy\汇总分类1")
上面的程序执行完后的效果如下图:
7.对每个DCE序列的dicom文件按InstanceNumber重命名
InstanceNumber为dicom文件的tag信息,用microdicom或者Radiant DICOM Viewer打开影像文件时可以看到
def rename_dicom(folderPath):
    """Rename every DCE DICOM file to its zero-padded InstanceNumber.

    Files matching ``<folderPath>/<study>/DCEMR/*/*/*.dcm`` are read (header
    only) and renamed to ``<InstanceNumber as 5 digits>.dcm`` inside the same
    folder.  An InstanceNumber of 200 or more still triggers the warning but
    the file is renamed like any other.  Files without an InstanceNumber and
    renames that would hit an existing file are reported and skipped.

    Args:
        folderPath: Directory that contains one sub-directory per study.
    """
    count_study = 0
    for every_study in os.listdir(folderPath):
        count_study += 1
        pattern = os.path.join(folderPath, every_study, "DCEMR", "*", "*", "*.dcm")
        for oldFileName in glob.glob(pattern):
            dcm = pydicom.dcmread(
                oldFileName,
                stop_before_pixels=True,
                specific_tags=["InstanceNumber"],
            )
            InstanceNumber = getattr(dcm, "InstanceNumber", None)
            if InstanceNumber is None:
                print('skip', oldFileName, 'no InstanceNumber')
                continue
            InstanceNumber = int(InstanceNumber)
            if InstanceNumber >= 200:
                print('Warning!!', InstanceNumber, "dicom数量大于200")
            # One format for every value keeps the width constant and avoids
            # adding an int to ".dcm" for large instance numbers.
            newFileName = os.path.join(
                os.path.dirname(oldFileName), "%05d.dcm" % InstanceNumber
            )
            if newFileName == oldFileName:
                continue
            if os.path.exists(newFileName):
                # Never overwrite another slice.
                print('skip', oldFileName, 'target already exists')
                continue
            os.rename(oldFileName, newFileName)
    print(count_study, "个病例DCE-MRI序列dicom重命名完成")
# Classified data set whose DCE DICOM files are renamed by InstanceNumber.
folderPath=r"G:\去皮肤数据Copy\汇总分类1"
rename_dicom(folderPath)
上面的程序执行完后的效果如下图(每张dicom按照顺序存储,命名方式也易于读取):
对于乳腺皮肤如何去掉和把分割后的数据转成nii格式,后面将详细阐述
python医学影像2Ddicom文件转成3Dnii文件(保留原始dicom信息)
说明:本文为原创文章,转载或引用请注明网址和标题;有不正确的地方欢迎批评指正nick.yu.jd@qq.com