Преглед на файлове

remove porn gambling site from domain list

pull/342/head
Guillaume Vincent преди 8 години
родител
ревизия
dcd211091b
променени са 4 файла, в които са добавени 34715 реда и са изтрити 13 реда
  1. +2
    -1
      .gitignore
  2. +34673
    -0
      domains/blacklist.txt
  3. +1
    -0
      domains/domains.json
  4. +39
    -12
      domains/parse-top-domains.py

+ 2
- 1
.gitignore Целия файл

@@ -1 +1,2 @@
node_modules
node_modules
/domains/top-1m.csv

+ 34673
- 0
domains/blacklist.txt
Файловите разлики са ограничени, защото са твърде много
Целия файл


+ 1
- 0
domains/domains.json
Файловите разлики са ограничени, защото са твърде много
Целия файл


+ 39
- 12
domains/parse-top-domains.py Целия файл

@@ -1,12 +1,39 @@
# https://support.alexa.com/hc/en-us/articles/200461990-Can-I-get-a-list-of-top-sites-from-an-API-
counter = {}
with open('source.csv') as source, open('destination.csv', 'w') as destination:
destination.write('[')
for line in source.readlines():
line = line.strip()
ext = line.split('.')[-1]
if ext in ['com', 'fr', 'org', 'net', 'eu']:
counter[ext] = counter.get(ext, 0) + 1
if ext == 'com' and counter[ext] < 200 or counter[ext] < 100:
destination.write('"%s",' % line.split(',')[-1])
destination.write(']')
# top-1m.csv get from https://support.alexa.com/hc/en-us/articles/200461990-Can-I-get-a-list-of-top-sites-from-an-API-
import json
import os


def get_domains():
domains = ['lesspass.com']
counter = {}
with open('top-1m.csv') as source:
for line in source.readlines():
line = line.strip()
ext = line.split('.')[-1]
if ext in ['com', 'fr', 'org', 'net', 'eu']:
counter[ext] = counter.get(ext, 0) + 1
if ext in ['com', 'fr'] and counter[ext] < 200 or counter[ext] < 100:
domains.append(line.split(',')[-1])
return domains


def get_blacklist_domains():
blacklist_domains = []
with open('blacklist.txt') as source:
for line in source.readlines():
line = line.strip()
if line and not line.startswith('#'):
blacklist_domains.append(line.split()[1])
return blacklist_domains


def save_good_domains_in_json(domains, blacklist_domains):
with open('domains.json', 'w') as destination:
destination.write(json.dumps([x for x in domains if x not in blacklist_domains]))


if os.path.exists('top-1m.csv'):
save_good_domains_in_json(get_domains(), get_blacklist_domains())
else:
print('need top-1m.csv get from '
'https://support.alexa.com/hc/en-us/articles/200461990-Can-I-get-a-list-of-top-sites-from-an-API-')

Зареждане…
Отказ
Запис