# K-Anonymity data anonymization system, built with Python and Tkinter.
from tkinter import *
from tkinter import filedialog
from tkinter import messagebox

import pandas as pd
class KAnonymitySystem:
    """Tkinter front end that loads a CSV file, generalizes it and saves the result.

    The buttons drive a fixed workflow: pick a CSV file, load and clean it,
    enter the grouping ("sensitive") column names and ``k``, build the
    anonymized table, then optionally save that table as CSV.
    """

    # Number of equal-width intervals a numeric column is generalized into.
    _NUMERIC_BIN_COUNT = 10

    def __init__(self, root):
        """Create the widgets inside *root* and initialise the empty state.

        Only the file label and the file-picker button start enabled; every
        other control is enabled by the step that precedes it.
        """
        self.root = root
        self.root.title('K-Anonymity System')
        self.root.geometry('500x500')

        # Application state, filled in as the user walks through the steps.
        self.file_path = ''               # path of the CSV chosen by the user
        self.df = None                    # cleaned input data (set by preprocess_data)
        self.sensitive_attributes = []    # column names the data is grouped by
        self.k = 0                        # group-size threshold entered by the user
        self.anonymized_df = None         # output table (set by anonymize_data)

        self.file_label = Label(self.root, text='未查找到文件.')
        self.file_label.pack()
        self.select_file_button = Button(
            self.root, text='插入文件', command=self.select_file)
        self.select_file_button.pack()
        self.preprocess_button = Button(
            self.root, text='运行程序', command=self.preprocess_data, state=DISABLED)
        self.preprocess_button.pack()

        self.sensitivity_label = Label(
            self.root, text='输入敏感数据 (逗号分割):', state=DISABLED)
        self.sensitivity_label.pack()
        self.sensitivity_entry = Entry(self.root, state=DISABLED)
        self.sensitivity_entry.pack()
        self.k_label = Label(self.root, text='输入k值:', state=DISABLED)
        self.k_label.pack()
        self.k_entry = Entry(self.root, state=DISABLED)
        self.k_entry.pack()

        self.anonymize_button = Button(
            self.root, text='匿名数据集', command=self.anonymize_data, state=DISABLED)
        self.anonymize_button.pack()
        self.save_button = Button(
            self.root, text='保存匿名数据集', command=self.save_anonymized_data,
            state=DISABLED)
        self.save_button.pack()
        self.result_text = Text(self.root, state=DISABLED, height=20)
        self.result_text.pack()

    def _show_error(self, message):
        """Report *message* to the user in a modal error dialog."""
        messagebox.showerror('K-Anonymity System', message, parent=self.root)

    def select_file(self):
        """Ask the user for a CSV file and remember the chosen path."""
        chosen = filedialog.askopenfilename(filetypes=[('CSV Files', '*.csv')])
        if not chosen:
            # Dialog cancelled: keep whatever file was selected before.
            return
        self.file_path = chosen
        self.file_label.config(text=self.file_path)
        self.preprocess_button.config(state=NORMAL)

    def preprocess_data(self):
        """Load the selected CSV, drop duplicate and incomplete rows, unlock the inputs."""
        try:
            frame = pd.read_csv(self.file_path)
        except (OSError, ValueError) as exc:
            # pandas' EmptyDataError / ParserError and decoding errors are
            # ValueError subclasses; a missing or unreadable file is an OSError.
            self._show_error(f'Could not read the file:\n{exc}')
            return
        self.df = frame.drop_duplicates().dropna()
        for widget in (self.sensitivity_label, self.sensitivity_entry,
                       self.k_label, self.k_entry, self.anonymize_button):
            widget.config(state=NORMAL)

    def anonymize_data(self):
        """Validate the user input, build the anonymized table and display it.

        Reads the comma-separated column names and ``k`` from the entry
        widgets. On invalid input an error dialog is shown and the current
        state is left untouched.
        """
        if self.df is None or self.df.empty:
            self._show_error('There is no data to anonymize.')
            return

        # Tolerate spaces around the commas, empty items and repeated names.
        names = (part.strip() for part in self.sensitivity_entry.get().split(','))
        attributes = list(dict.fromkeys(name for name in names if name))
        if not attributes:
            self._show_error('Enter at least one column name.')
            return
        unknown = [name for name in attributes if name not in self.df.columns]
        if unknown:
            self._show_error('Unknown column(s): ' + ', '.join(unknown))
            return

        try:
            k = int(self.k_entry.get())
        except ValueError:
            self._show_error('k must be a whole number.')
            return
        if k < 1:
            self._show_error('k must be at least 1.')
            return

        self.sensitive_attributes = attributes
        self.k = k
        self.anonymized_df = self._build_anonymized_frame()
        accuracy = self.calculate_accuracy()

        self.result_text.config(state=NORMAL)
        self.result_text.delete('1.0', END)
        self.result_text.insert(END, 'Anonymized Data:\n')
        self.result_text.insert(END, self.anonymized_df.to_string(index=False))
        self.result_text.insert(END, f'\nAccuracy: {accuracy}')
        self.result_text.config(state=DISABLED)
        self.save_button.config(state=NORMAL)

    def _build_anonymized_frame(self) -> pd.DataFrame:
        """Return the anonymized table for the current ``df``, columns and ``k``.

        Rows are grouped by ``sensitive_attributes``. A group with at least
        ``k`` rows has its remaining columns generalized and gets a
        ``cluster`` column that splits the group's index range into ``k``
        equal-width bins (codes start at 0).
        """
        keys = self.sensitive_attributes
        parts = []
        for _, group in self.df.groupby(keys):
            if len(group) < self.k:
                # NOTE(review): groups smaller than k are passed through
                # unmodified, exactly as before - confirm this is intended.
                parts.append(group)
                continue
            generalized = group.drop(columns=keys).drop_duplicates().copy()
            generalized = self.generalize_data(generalized)
            generalized['cluster'] = pd.cut(
                generalized.index, bins=self.k, labels=False)
            # Column-wise concat aligns on the row index, so every row of the
            # group keeps its key values next to its generalized values.
            parts.append(pd.concat([group[keys], generalized], axis=1))
        if not parts:
            return pd.DataFrame(columns=self.df.columns)
        # Concatenate once instead of growing a frame inside the loop.
        return pd.concat(parts)

    def save_anonymized_data(self):
        """Ask for a destination path and write the anonymized table as CSV."""
        if self.anonymized_df is None:
            return
        save_path = filedialog.asksaveasfilename(
            defaultextension='.csv', filetypes=[('CSV Files', '*.csv')])
        if not save_path:
            # Dialog cancelled: nothing to write.
            return
        try:
            self.anonymized_df.to_csv(save_path, index=False)
        except OSError as exc:
            self._show_error(f'Could not save the file:\n{exc}')

    def generalize_data(self, group_data: pd.DataFrame) -> pd.DataFrame:
        """Generalize every column of *group_data* in place and return it.

        Numeric columns are replaced by the label of the equal-width interval
        each value falls into; all other columns have their last character
        replaced by ``'*'``.
        """
        for column in group_data.columns:
            values = group_data[column]
            # Booleans count as numeric for pandas but cannot be subtracted,
            # so they are masked as text instead of binned.
            is_number = (pd.api.types.is_numeric_dtype(values)
                         and not pd.api.types.is_bool_dtype(values))
            if is_number:
                group_data[column] = self._bin_numeric(values)
            else:
                group_data[column] = self._mask_last_character(values)
        return group_data

    @staticmethod
    def _mask_last_character(values: pd.Series) -> pd.Series:
        """Return *values* as text with the final character replaced by ``'*'``."""
        is_text = (pd.api.types.is_object_dtype(values)
                   or pd.api.types.is_string_dtype(values))
        text = values if is_text else values.astype(str)
        return text.str[:-1] + '*'

    @classmethod
    def _bin_numeric(cls, values: pd.Series) -> pd.Series:
        """Replace each number in *values* by a ``'low-high'`` interval label."""
        low, high = values.min(), values.max()
        if pd.isna(low) or pd.isna(high):
            # Empty or all-missing column: there is no range to bin.
            return values
        low, high = float(low), float(high)

        if low == high:
            # A single distinct value gives zero-width bins, which pd.cut
            # rejects; label the degenerate range directly instead.
            label = f'{low:.2f}-{high:.2f}'
            constant = pd.Series(label, index=values.index, dtype=object)
            return constant.where(values.notna())

        count = cls._NUMERIC_BIN_COUNT
        step = (high - low) / count
        # Use `high` itself as the last edge so float rounding in
        # low + step * count can never push the maximum out of range.
        edges = [low + step * i for i in range(count)] + [high]
        labels = [f'{left:.2f}-{right:.2f}'
                  for left, right in zip(edges, edges[1:])]
        return pd.cut(
            values,
            bins=edges,
            labels=labels,
            # Without this the minimum falls outside the first (left-open)
            # interval and becomes NaN.
            include_lowest=True,
            # Labels rounded to two decimals can collide on narrow ranges;
            # pandas only accepts duplicate labels for unordered categories.
            ordered=len(set(labels)) == len(labels),
        )

    def calculate_accuracy(self) -> float:
        """Return the mean per-group share of rows present in the anonymized table.

        For every group of ``df`` (grouped by ``sensitive_attributes``) the
        number of rows of that group in ``anonymized_df`` is divided by the
        number of rows in ``df``; groups missing from ``anonymized_df`` count
        as 0. The result is the mean of those ratios.
        """
        keys = self.sensitive_attributes
        actual = self.df.groupby(keys).size().reset_index(name='actual_count')
        kept = (self.anonymized_df.groupby(keys).size()
                .reset_index(name='anonymized_count'))
        merged = actual.merge(kept, on=keys, how='left')
        # Only the count column can be missing after the left join.
        kept_counts = merged['anonymized_count'].fillna(0)
        return float((kept_counts / merged['actual_count']).mean())
if __name__ == '__main__':
    # Launch the GUI only when the file is run as a script, so importing the
    # module (for reuse or testing) does not open a window.
    root = Tk()
    k_anonymity_system = KAnonymitySystem(root)
    # Blocks until the window is closed.
    root.mainloop()
# Source: https://www.cveoy.top/t/topic/ojMG - copyright belongs to the original author; do not repost or scrape.