You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

25 lines
728 B

# Convert the URLs to JSON format and save them to urls.json; collapse duplicate URLs and remove the trailing `/` from each URL.
import json
import re
# Path of the text file to scan for URLs.
# TODO(review): the path is an empty string in the source as received, which
# open() rejects with FileNotFoundError; set it to the real input file.
INPUT_PATH = ""
OUTPUT_PATH = "urls.json"

# Matches an http(s) URL that is immediately followed by a closing ']'.
# NOTE: `$-_` inside the character class is a range (0x24-0x5F), so it also
# admits '/', ':', '[' and ']'. Bracketed URLs that touch each other with no
# whitespace in between are therefore captured as a single match.
URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+(?=\])'
)


def collect_unique_urls(content):
    """Return the distinct URLs found in *content*, in first-seen order.

    Only URLs directly followed by ``]`` are picked up. A single trailing
    ``/`` is removed from each URL before duplicates are collapsed, so
    ``https://x.org/`` and ``https://x.org`` count as the same entry.

    Args:
        content: Text to scan.

    Returns:
        A list of unique URL strings; empty when nothing matches.
    """
    urls = URL_PATTERN.findall(content)
    trimmed = (url[:-1] if url.endswith('/') else url for url in urls)
    # dict.fromkeys de-duplicates in one pass while keeping insertion order,
    # avoiding a linear list scan per URL.
    return list(dict.fromkeys(trimmed))


def main():
    """Read INPUT_PATH, extract its unique URLs, and write them to OUTPUT_PATH as JSON."""
    with open(INPUT_PATH, "r", encoding="utf-8") as file:
        content = file.read()

    unique_urls = collect_unique_urls(content)

    # json.dump escapes non-ASCII by default, so the explicit encoding only
    # makes the output independent of the platform's default encoding.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as file:
        json.dump(unique_urls, file)


if __name__ == "__main__":
    main()