# Forum post header: +Warwagon MVC Posted January 5, 2024 MVC Share Posted January 5, 2024
"""PDF name search GUI.

A small wxPython application that scans a PDF (via PyMuPDF) for first names
listed in a text file and reports each "<name> <following word>" pair that
passes a set of filters, or searches the PDF for one exact phrase.
"""

import re
import string

import fitz  # PyMuPDF
import wx


def _whole_phrase_pattern(phrase):
    """Compile a pattern that matches *phrase* literally as a whole unit.

    The phrase is escaped so regex metacharacters in user input (for example
    parentheses in a name) are matched literally instead of raising
    ``re.error``. Lookarounds are used instead of ``\\b`` so phrases that
    start or end with a non-word character can still match.
    """
    return re.compile(rf'(?<!\w){re.escape(phrase)}(?!\w)')


class PDFSearchApp(wx.Frame):
    """Main window: file pickers, search controls and a results pane."""

    # Bare day-of-month numbers ("January 5, 2024") are never surnames.
    _DAY_NUMBERS = frozenset(str(day) for day in range(1, 32))
    _SPECIAL_CHARS = frozenset({'#', '&', '-', '?', '.'})
    # Errors that opening/reading a PDF or text file is expected to raise.
    _SEARCH_ERRORS = (OSError, RuntimeError, ValueError)

    def __init__(self, *args, **kw):
        super().__init__(*args, **kw)
        self.panel = wx.Panel(self)
        self.create_widgets()
        self.stop_search_flag = False  # Flag to signal stop search
        self.found_results = []  # Store found results
        self.dictionary = set()  # Set to store dictionary words

    def create_widgets(self):
        """Create all controls and lay them out in a vertical sizer."""
        self.file_picker_names = wx.FilePickerCtrl(
            self.panel, message="Choose the Names TXT file:")
        self.file_picker_pdf = wx.FilePickerCtrl(
            self.panel, message="Choose the PDF file:")
        self.file_picker_dictionary = wx.FilePickerCtrl(
            self.panel, message="Choose the Dictionary TXT file:")

        self.search_button = wx.Button(self.panel, label="Search", size=(80, 30))
        self.search_button.Bind(wx.EVT_BUTTON, self.on_search)

        self.stop_search_button = wx.Button(
            self.panel, label="Stop Search", size=(100, 30))
        self.stop_search_button.Bind(wx.EVT_BUTTON, self.on_stop_search)

        self.result_text = wx.TextCtrl(
            self.panel, style=wx.TE_MULTILINE | wx.TE_READONLY, size=(400, 300))

        # Components for the exact-text search box
        self.search_text = wx.TextCtrl(self.panel, size=(200, -1))
        self.search_button_exact = wx.Button(
            self.panel, label="Search Exact", size=(120, 30))
        self.search_button_exact.Bind(wx.EVT_BUTTON, self.on_search_exact)

        # Save Results button
        self.save_button = wx.Button(
            self.panel, label="Save Results", size=(120, 30))
        self.save_button.Bind(wx.EVT_BUTTON, self.on_save_results)

        expand = wx.ALL | wx.EXPAND
        center = wx.ALL | wx.CENTER

        sizer = wx.BoxSizer(wx.VERTICAL)
        sizer.Add(wx.StaticText(self.panel, label="Names File:"), 0, expand, 10)
        sizer.Add(self.file_picker_names, 0, expand, 10)
        sizer.Add(wx.StaticText(self.panel, label="PDF File:"), 0, expand, 10)
        sizer.Add(self.file_picker_pdf, 0, expand, 10)
        sizer.Add(wx.StaticText(self.panel, label="Dictionary File:"), 0, expand, 10)
        sizer.Add(self.file_picker_dictionary, 0, expand, 10)
        sizer.Add(self.search_button, 0, center, 10)
        sizer.Add(self.stop_search_button, 0, center, 10)
        sizer.Add(self.result_text, 1, expand, 10)

        # Search box components
        sizer.Add(wx.StaticText(self.panel, label="Search Exact Text:"), 0, expand, 10)
        sizer.Add(self.search_text, 0, expand, 10)
        sizer.Add(self.search_button_exact, 0, center, 10)

        # Save Results button
        sizer.Add(self.save_button, 0, center, 10)

        self.panel.SetSizer(sizer)

    def on_search(self, event):
        """Handle the Search button: validate inputs, load files, run the search."""
        print("Search button clicked")
        self.stop_search_flag = False  # Reset the flag
        self.found_results.clear()  # Clear previous results

        names_file_path = self.file_picker_names.GetPath()
        pdf_file_path = self.file_picker_pdf.GetPath()
        dictionary_file_path = self.file_picker_dictionary.GetPath()

        print(f"Names File Path: {names_file_path}")
        print(f"PDF File Path: {pdf_file_path}")
        print(f"Dictionary File Path: {dictionary_file_path}")

        if not pdf_file_path:
            wx.MessageBox("Please select a PDF file.", "Error", wx.OK | wx.ICON_ERROR)
            return

        # Without this check an empty path reaches open('') and raises an
        # unhandled FileNotFoundError inside the event handler.
        if not names_file_path or not dictionary_file_path:
            wx.MessageBox("Please select a Names file and a Dictionary file.",
                          "Error", wx.OK | wx.ICON_ERROR)
            return

        try:
            names = self.load_names(names_file_path)
            self.load_dictionary(dictionary_file_path)
        except (OSError, UnicodeDecodeError) as err:
            wx.MessageBox(f"Could not read input file:\n{err}",
                          "Error", wx.OK | wx.ICON_ERROR)
            return

        self._run_search(self.search_and_display_results, pdf_file_path, names)

    def on_stop_search(self, event):
        """Handle the Stop Search button by raising the stop flag."""
        print("Stop Search button clicked")
        self.stop_search_flag = True
        # Do not save results automatically when the search is stopped

    def on_search_exact(self, event):
        """Handle the Search Exact button: validate inputs and run the search."""
        print("Search Exact button clicked")
        self.stop_search_flag = False  # Reset the flag
        self.found_results.clear()  # Clear previous results

        pdf_file_path = self.file_picker_pdf.GetPath()
        exact_text = self.search_text.GetValue().strip()

        print(f"PDF File Path: {pdf_file_path}")
        print(f"Exact Text to Search: {exact_text}")

        if not pdf_file_path or not exact_text:
            wx.MessageBox("Please select a PDF file and enter exact text to search.",
                          "Error", wx.OK | wx.ICON_ERROR)
            return

        self._run_search(self.search_exact_and_display_results,
                         pdf_file_path, exact_text)

    def _set_search_enabled(self, enabled):
        """Enable or disable both search buttons."""
        self.search_button.Enable(enabled)
        self.search_button_exact.Enable(enabled)

    def _run_search(self, search, *args):
        """Run *search* with the search buttons disabled and report failures.

        The searches call wx.Yield() so the Stop button stays responsive;
        disabling the search buttons prevents a second search from being
        started re-entrantly while one is still running.
        """
        self._set_search_enabled(False)
        try:
            search(*args)
        except self._SEARCH_ERRORS as err:
            wx.MessageBox(f"Search failed:\n{err}", "Error", wx.OK | wx.ICON_ERROR)
        finally:
            self._set_search_enabled(True)

    def load_names(self, file_path):
        """Return the non-blank lines of *file_path*, stripped of whitespace."""
        with open(file_path, 'r', encoding='utf-8') as file:
            stripped_lines = (line.strip() for line in file)
            return [name for name in stripped_lines if name]

    def load_dictionary(self, file_path):
        """Load the lowercased first token of each non-blank line into self.dictionary."""
        with open(file_path, 'r', encoding='utf-8') as file:
            self.dictionary = {
                line.split()[0].lower() for line in file if line.strip()
            }

    def is_valid_second_word(self, word):
        """Return True if *word* is plausible as the word following a name.

        The word is reduced to its alphanumeric characters plus a few
        permitted special characters; it is rejected if what remains is a
        day-of-month number (1-31), a lone special character, or shorter
        than two characters.
        """
        cleaned_word = ''.join(
            char for char in word
            if char.isalnum() or char in self._SPECIAL_CHARS
        )
        # Commas are removed by the cleaning step above, so the comparison
        # is made against bare numbers ("15"), not "15,".
        return (
            cleaned_word not in self._DAY_NUMBERS
            and cleaned_word not in self._SPECIAL_CHARS
            and len(cleaned_word) > 1
        )

    def clean_word(self, word):
        """Return *word* with every character except alphanumerics, '-' and '_' removed."""
        return ''.join(char for char in word if char.isalnum() or char in {'-', '_'})

    def _prepare_names(self, names):
        """Return (name, compiled pattern) pairs for the names worth searching.

        Blank names, names that are dictionary words, and names whose
        parenthesised content is a dictionary word are dropped here once,
        instead of being re-checked on every page.
        """
        prepared = []
        for raw_name in names:
            name = raw_name.strip()
            if not name:
                continue
            if name.lower() in self.dictionary:
                continue  # Ignore the entire name if it's in the dictionary
            in_parentheses = re.search(r'\((.*?)\)', name)
            if in_parentheses and in_parentheses.group(1).strip().lower() in self.dictionary:
                continue  # Content inside parentheses is a dictionary word
            prepared.append((name, _whole_phrase_pattern(name.lower())))
        return prepared

    def _is_reportable_next_word(self, next_word):
        """Apply the filters that decide whether *next_word* completes a name."""
        # Ignore if the second word has only 1 or 2 word characters
        if len(re.sub(r'\W', '', next_word)) in {1, 2}:
            return False
        # Exclude words that look like a domain (e.g. "example.com")
        if re.match(r'\w+\.\w+', next_word):
            return False
        # Ignore if the second word is all numbers
        if next_word.isdigit():
            return False
        return (
            self.is_valid_second_word(next_word)
            and next_word.lower() not in self.dictionary
        )

    def _record_result(self, result):
        """Show *result* in the results pane and remember it for saving."""
        self.result_text.AppendText(result)
        self.found_results.append(result)

    def search_and_display_results(self, pdf_file_path, names):
        """Search every page of the PDF for each name followed by a valid word.

        Matching is case-insensitive (both sides are lowercased) and
        whole-word. Every occurrence of a name on a page is examined; each
        distinct "<name> <next word>" pair is reported once per page.
        The loop checks self.stop_search_flag before each page.
        """
        print(f"Searching PDF file: {pdf_file_path}")
        self.result_text.Clear()

        candidates = self._prepare_names(names)

        pdf_document = fitz.open(pdf_file_path)
        try:
            for page_number in range(pdf_document.page_count):
                if self.stop_search_flag:
                    print("Search stopped.")
                    break

                text = pdf_document[page_number].get_text("text").lower()
                reported_on_page = set()

                for name, pattern in candidates:
                    for match in pattern.finditer(text):
                        # Take the word right after this word-bounded match;
                        # splitting on the raw name could pick a substring
                        # hit inside a longer word instead.
                        following = text[match.end():].split(None, 1)
                        if not following:
                            continue
                        next_word = following[0].rstrip(',\'')
                        if not self._is_reportable_next_word(next_word):
                            continue

                        clean_next_word = self.clean_word(next_word)
                        result = f"{name} {clean_next_word}, Page: {page_number + 1}\n"
                        if result in reported_on_page:
                            continue
                        reported_on_page.add(result)
                        self._record_result(result)
                        print(f"Match found: {result}")

                # Let the GUI process events so Stop Search can be clicked.
                wx.Yield()
        finally:
            pdf_document.close()

        # Do not save results automatically here
        print("Search completed.")

    def search_exact_and_display_results(self, pdf_file_path, exact_text):
        """Report each page of the PDF that contains *exact_text*.

        Matching is case-insensitive (both sides are lowercased) and treats
        *exact_text* as a literal phrase, not a regular expression.
        """
        print(f"Searching Exact Text in PDF file: {pdf_file_path}")
        self.result_text.Clear()

        pattern = _whole_phrase_pattern(exact_text.lower())

        pdf_document = fitz.open(pdf_file_path)
        try:
            for page_number in range(pdf_document.page_count):
                if self.stop_search_flag:
                    print("Search stopped.")
                    break

                text = pdf_document[page_number].get_text("text")
                if pattern.search(text.lower()):
                    result = f"Exact Text '{exact_text}' found on Page: {page_number + 1}\n"
                    self._record_result(result)
                    print(f"Exact Text found: {result}")

                # Let the GUI process events so Stop Search can be clicked.
                wx.Yield()
        finally:
            pdf_document.close()

        # Do not save results automatically here
        print("Search completed.")

    def on_save_results(self, event):
        """Handle the Save Results button."""
        print("Save Results button clicked")
        self.save_results_to_file_dialog()

    def save_results_to_file_dialog(self):
        """Ask for a destination file and write the current results to it."""
        dlg = wx.FileDialog(
            self,
            message="Save Results As...",
            defaultDir=wx.GetHomeDir(),
            defaultFile="results.txt",
            wildcard="Text files (*.txt)|*.txt|All files (*.*)|*.*",
            style=wx.FD_SAVE | wx.FD_OVERWRITE_PROMPT
        )
        try:
            if dlg.ShowModal() != wx.ID_OK:
                return
            file_path = dlg.GetPath()
        finally:
            # Destroy the dialog even if showing it raised.
            dlg.Destroy()

        try:
            self.save_results_to_file(file_path)
        except OSError as err:
            wx.MessageBox(f"Could not save results:\n{err}",
                          "Error", wx.OK | wx.ICON_ERROR)
            return
        wx.MessageBox(f"Results saved successfully to:\n{file_path}",
                      "Info", wx.OK | wx.ICON_INFORMATION)

    def save_results_to_file(self, file_path):
        """Write the found results to *file_path*, sorted case-insensitively."""
        sorted_results = sorted(self.found_results, key=str.lower)
        with open(file_path, "w", encoding="utf-8") as file:
            file.writelines(sorted_results)


if __name__ == '__main__':
    app = wx.App(False)
    frame = PDFSearchApp(None, title='PDF Search App', size=(600, 700))
    frame.Show()
    app.MainLoop()

# Forum post footer: +Zlip792 and xrobwx71 2 Share Link to comment
# https://www.neowin.net/forum/topic/1437118-pdf-name-search-python-app/
# Share on other sites More sharing options...
Recommended Posts
Create an account or sign in to comment
You need to be a member in order to leave a comment
Create an account
Sign up for a new account in our community. It's easy!
Register a new account | Sign in
Already have an account? Sign in here.
Sign In Now