You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
import base64
import email
from io import StringIO

from bs4 import BeautifulSoup
import pandas as pd

# Path to the .mhtml file
mhtml_file_path = "/<!-- INSERT PATH -->/_results.mhtml"
output_file_path = "/<!-- INSERT PATH -->/sorted_pairs.txt"


# Function to extract HTML content from .mhtml file
def extract_html_from_mhtml(file_path):
    """Return the first ``text/html`` part of an .mhtml file as a string.

    Args:
        file_path (str): Path to the .mhtml file, read in binary mode.

    Returns:
        str | None: The decoded HTML, or ``None`` when the message has no
        ``text/html`` part with a payload.
    """
    with open(file_path, 'rb') as file:
        mhtml_content = file.read()

    # Parse the mhtml content (an .mhtml file is a MIME message)
    msg = email.message_from_bytes(mhtml_content)

    # Find the HTML part
    for part in msg.walk():
        if part.get_content_type() != "text/html":
            continue
        html_content = part.get_payload(decode=True)
        if html_content is None:
            continue
        # Honour the charset declared on the part; fall back to UTF-8 only
        # when none is declared or the declared one cannot be used.
        charset = part.get_content_charset() or 'utf-8'
        try:
            return html_content.decode(charset)
        except (LookupError, UnicodeDecodeError):
            return html_content.decode('utf-8', errors='replace')
    return None


# Function to parse HTML and extract table data
def parse_html_table(html_content):
    """Parse the first ``<table>`` in ``html_content`` into a DataFrame.

    Args:
        html_content (str): HTML markup.

    Returns:
        pandas.DataFrame: The first table ``pd.read_html`` extracts from the
        first ``<table>`` element.

    Raises:
        ValueError: If the markup contains no ``<table>`` element.
    """
    soup = BeautifulSoup(html_content, 'html5lib')
    table = soup.find('table')
    if table is None:
        # Without this guard the literal text "None" would be handed to pandas.
        raise ValueError("No <table> element found in the HTML content.")
    # Wrap in StringIO: passing literal HTML strings to read_html is deprecated.
    return pd.read_html(StringIO(str(table)))[0]
# Function to process and sort the table data
def process_and_sort_table(df):
    """Turn a similarity matrix into a list of document pairs sorted by score.

    The first row of ``df`` (from the second column on) supplies the document
    names; the remaining rows and columns are treated as the score matrix and
    are labelled with those names on both axes.

    Args:
        df (pandas.DataFrame): Table whose first row and first column hold
            labels and whose remaining cells hold similarity scores.

    Returns:
        pandas.DataFrame: Columns ``'Document 1'``, ``'Document 2'`` and
        ``'Similarity Score'`` (numeric), sorted by score in descending
        order, with rows that have no usable numeric score removed.
    """
    # Extract the document names from the first row and column
    document_names = df.iloc[0, 1:].tolist()
    # Copy so relabelling the axes never touches (or warns about) the input.
    matrix = df.iloc[1:, 1:].copy()
    matrix.columns = document_names
    matrix.index = document_names

    # Melt the DataFrame to get pairs and their similarity scores
    melted_df = matrix.reset_index().melt(
        id_vars='index', var_name='Document Pair', value_name='Similarity Score'
    )
    melted_df.columns = ['Document 1', 'Document 2', 'Similarity Score']

    # Convert to numeric BEFORE dropping NaN: cells that are not numbers are
    # coerced to NaN here and must be dropped too, otherwise they end up in
    # the output as "nan" scores.
    melted_df['Similarity Score'] = pd.to_numeric(
        melted_df['Similarity Score'], errors='coerce'
    )
    melted_df = melted_df.dropna()

    # Sort the DataFrame by similarity scores
    sorted_df = melted_df.sort_values(by='Similarity Score', ascending=False)
    return sorted_df


# Function to write the sorted pairs to a text file
def write_sorted_pairs_to_file(sorted_df, file_path):
    """Write one ``Pair: ...`` line per row of ``sorted_df`` to ``file_path``.

    Args:
        sorted_df (pandas.DataFrame): Frame with ``'Document 1'``,
            ``'Document 2'`` and ``'Similarity Score'`` columns.
        file_path (str): Output path; the file is overwritten and encoded as
            UTF-8.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        for _, row in sorted_df.iterrows():
            file.write(f"Pair: <!__{row['Document 1']}__> - <!__{row['Document 2']}__>, Similarity Score: {row['Similarity Score']}\n")
# Main script execution: extract the HTML from the .mhtml file, then parse,
# sort and write the pairs; report when no HTML part was found.
html_content = extract_html_from_mhtml(mhtml_file_path)
if not html_content:
    print("No HTML content found in the .mhtml file.")
else:
    df = parse_html_table(html_content)
    sorted_df = process_and_sort_table(df)
    write_sorted_pairs_to_file(sorted_df, output_file_path)
    print(f"Sorted pairs have been written to {output_file_path}")
Grouping (pdf2folders.py):
import os
import shutil
import re


def clean_filename(filename):
    """Strip punctuation from ``filename`` and replace each space with ``_``.

    Every character that is not a word character, whitespace or a hyphen is
    removed first; afterwards each space character becomes an underscore
    (other whitespace is left as it is).
    """
    without_punctuation = re.sub(r'[^\w\s-]', '', filename)
    return '_'.join(without_punctuation.split(' '))
def find_pdf(pdf_name, pdf_dir):
    """Search ``pdf_dir`` for a file whose cleaned name matches ``pdf_name``.

    Both names are passed through ``clean_filename`` and lower-cased before
    they are compared.

    Args:
        pdf_name (str): Name to look for.
        pdf_dir (str): Directory whose entries are scanned (not recursive).

    Returns:
        str | None: Path of the first matching regular file, or ``None``.
    """
    # The target does not change per entry, so clean it once.
    target = clean_filename(pdf_name).lower()
    for filename in os.listdir(pdf_dir):
        if clean_filename(filename).lower() != target:
            continue
        candidate = os.path.join(pdf_dir, filename)
        # Only files qualify; sub-folders of pdf_dir must never be returned.
        if os.path.isfile(candidate):
            return candidate
    return None


def create_and_move(pairs_file, pdf_dir):
    """Create a folder for each PDF pair and move both PDFs into it.

    Lines of ``pairs_file`` that match the ``Pair: <!...> - <!...>,
    Similarity Score`` format are processed in order; other lines are
    ignored. Folders are named ``folder001``, ``folder002``, ... inside
    ``pdf_dir``; the counter advances only after both files were moved.

    Args:
        pairs_file (str): Path to the file containing the PDF pairs.
        pdf_dir (str): Path to the directory containing the PDFs.
    """
    pair_pattern = re.compile(r"Pair: <!\s*(.*?)\s*> - <!\s*(.*?)\s*>, Similarity Score.*")
    folder_count = 1

    # Read as UTF-8 explicitly so non-ASCII document names do not depend on
    # the platform's default encoding.
    with open(pairs_file, 'r', encoding='utf-8') as f:
        for line in f:
            match = pair_pattern.match(line)
            if not match:
                continue

            # Remove surrounding whitespace, then leading and trailing underscores
            pdf1_name = match.group(1).strip().strip('_') + ".pdf"
            pdf2_name = match.group(2).strip().strip('_') + ".pdf"

            # Find the actual PDF files
            pdf1_path = find_pdf(pdf1_name, pdf_dir)
            pdf2_path = find_pdf(pdf2_name, pdf_dir)

            if not pdf1_path:
                print(f"Warning: Could not find PDF: {pdf1_name}")
            if not pdf2_path:
                print(f"Warning: Could not find PDF: {pdf2_name}")
            if not (pdf1_path and pdf2_path):
                continue

            if pdf1_path == pdf2_path:
                # A document paired with itself: moving it would leave a
                # folder holding a single file and fail on the second move.
                print(f"Skipping pair that refers to the same file twice: {pdf1_name}")
                continue

            # Create a folder name using numeric notation
            folder_name = f"folder{folder_count:03d}"
            folder_path = os.path.join(pdf_dir, folder_name)

            # Create the folder if it doesn't exist
            os.makedirs(folder_path, exist_ok=True)

            # Move the PDFs into the folder
            try:
                shutil.move(pdf1_path, folder_path)
                shutil.move(pdf2_path, folder_path)
            except OSError as e:  # shutil.Error is a subclass of OSError
                print(f"Error moving files: {e}")
            else:
                print(f"Moved {pdf1_name} and {pdf2_name} to {folder_path}")
                folder_count += 1
if __name__ == "__main__":
    # Replace with the actual path to your pairs file
    pairs_file = "/<!-- INSERT PATH -->/sorted_pairs.txt"
    # Replace with the actual path to your PDF directory
    pdf_dir = "/<!-- INSERT PATH -->/030.REFS"

    create_and_move(pairs_file, pdf_dir)
Removing one of the pair (rmSMALLpdf.py):
import os


def delete_smaller_pdf(folder_path):
    """Delete the smaller of the two PDFs in the given folder.

    The folder is skipped (with a message) unless it contains exactly two
    PDF files. The extension check is case-insensitive and only regular
    files are counted. Candidates are sorted by name so that, when both
    files have the same size, the one that sorts second is deleted.

    Args:
        folder_path (str): Path to the folder containing the PDFs.
    """
    pdf_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(folder_path, name))
    )

    if len(pdf_files) != 2:
        print(f"Skipping folder {folder_path} as it does not contain exactly two PDFs.")
        return

    pdf1_path = os.path.join(folder_path, pdf_files[0])
    pdf2_path = os.path.join(folder_path, pdf_files[1])

    if os.path.getsize(pdf1_path) < os.path.getsize(pdf2_path):
        smaller_path = pdf1_path
    else:
        smaller_path = pdf2_path

    os.remove(smaller_path)
    print(f"Deleted smaller PDF: {smaller_path}")
def iterate_folders(base_dir):
    """Run ``delete_smaller_pdf`` on every folder found below ``base_dir``.

    The tree is traversed with ``os.walk``, so nested sub-folders at any
    depth are visited; ``base_dir`` itself is not passed to
    ``delete_smaller_pdf``.

    Args:
        base_dir (str): Path to the base directory containing the folders.
    """
    for parent, subfolders, _ in os.walk(base_dir):
        for folder_path in (os.path.join(parent, name) for name in subfolders):
            delete_smaller_pdf(folder_path)
if __name__ == "__main__":
    # Replace with the actual path to your base directory
    base_dir = "/<!-- INSERT PATH -->/030.REFS"

    iterate_folders(base_dir)
The text was updated successfully, but these errors were encountered:
First, congratulations on the awesome work, keep it up!
Now, I've made some Python scripts; if they are useful to you or the project, feel free to merge them. Here they are:
https://ib.bsb.br/copy-spotter
Ordering (sort-SIMILAR-pairs.py):
Grouping (pdf2folders.py):
Removing one of the pair (rmSMALLpdf.py):
The text was updated successfully, but these errors were encountered: