Add files via upload

maliqto · Mar 11, 2024 · e3207ea · e3207ea
1 parent ca1122e
commit e3207ea
Show file tree

Hide file tree

Showing 4 changed files with 128 additions and 0 deletions.
diff --git a/apagar_indice.py b/apagar_indice.py
@@ -0,0 +1,10 @@
+from elasticsearch import Elasticsearch
+
+# Configuração do Elasticsearch
+es = Elasticsearch(['http://localhost:9200/'])
+
+# Nome do índice
+index_name = 'k1ppers'
+
+# Apagar o índice
+es.indices.delete(index=index_name)
diff --git a/indexar.py b/indexar.py
@@ -0,0 +1,81 @@
+from elasticsearch import Elasticsearch, helpers
+import hashlib
+import threading
+import re
+import os
+
+# Configuração do Elasticsearch
+es = Elasticsearch(['http://localhost:9200/'], timeout=30, max_retries=10, retry_on_timeout=True)
+
+index_name = 'k1ppers'
+
+# Lock para sincronizar o acesso aos contadores
+lock = threading.Lock()
+
+def create_index_action(line, counters):
+    global lock
+
+    cleaned_line = line.strip()
+    # Dividir a linha usando um delimitador específico que sabemos que não aparece na URL
+    parts = re.split(';|\\|', cleaned_line, maxsplit=2)
+
+    if len(parts) == 3:
+        url, username, password = parts
+        # Agora não precisamos nos preocupar em dividir a URL incorretamente
+        document = {
+            "url": url,
+            "username": username,
+            "password": password
+        }
+        doc_hash = hashlib.sha256((url + username + password).encode()).hexdigest()
+
+        if not es.exists(index=index_name, id=doc_hash):
+            action = {
+                "_op_type": "index",
+                "_index": index_name,
+                "_id": doc_hash,
+                "_source": document
+            }
+            with lock:
+                counters['added_counter'] += 1
+            return action
+        else:
+            with lock:
+                counters['duplicate_counter'] += 1
+    else:
+        with lock:
+            counters['invalid_counter'] += 1
+        print(f"Linha considerada inválida: {cleaned_line}")
+    return None
+
+def process_file(file_path, counters):
+    batch = []
+    try:
+        with open(file_path, 'r', encoding='utf-8') as txtfile:
+            for i, line in enumerate(txtfile):
+                action = create_index_action(line, counters)
+                if action:
+                    batch.append(action)
+
+                if len(batch) >= 1000:
+                    helpers.bulk(es, batch)
+                    batch.clear()
+                    print(f"Processado lote de 1000 documentos até agora.")
+
+            if batch:
+                helpers.bulk(es, batch)
+                print("Processado último lote de documentos.")
+    except Exception as e:
+        print(f"Erro ao processar arquivo {file_path}: {e}")
+
+def main(directory_path):
+    counters = {'added_counter': 0, 'duplicate_counter': 0, 'invalid_counter': 0}
+    for filename in os.listdir(directory_path):
+        if filename.endswith('.txt'):
+            file_path = os.path.join(directory_path, filename)
+            print(f"Processando o arquivo: {filename}")
+            process_file(file_path, counters)
+    print(f"Processamento concluído. Linhas adicionadas: {counters['added_counter']}, Duplicatas: {counters['duplicate_counter']}, Inválidas: {counters['invalid_counter']}")
+
+if __name__ == "__main__":
+    main('C:/Users/vicktor/Downloads/CLOUD')  # Substitua pelo diretório de destino
diff --git a/pesquisar.py b/pesquisar.py
@@ -0,0 +1,26 @@
+from elasticsearch import Elasticsearch
+
+# Configuração do Elasticsearch
+es = Elasticsearch(['http://localhost:9200/'])
+
+# Nome do índice
+index_name = 'k1ppers'
+
+# Consulta de busca
+query = {
+    "query": {
+        "match": {
+            "url": "seguidores"
+        }
+    }
+}
+
+# Fazer a busca
+response = es.search(index=index_name, body=query)
+
+# Imprimir o número total de hits (documentos retornados)
+print(f"Número total de documentos retornados: {response['hits']['total']['value']}")
+
+# Imprimir a resposta
+for hit in response['hits']['hits']:
+    print(f"Documento ID: {hit['_id']}, Pontuação: {hit['_score']}, Documento: {hit['_source']}")
diff --git a/ver_indices.py b/ver_indices.py
@@ -0,0 +1,11 @@
+from elasticsearch import Elasticsearch
+
+# Elasticsearch client pointed at http://localhost:9200/
+es = Elasticsearch(['http://localhost:9200/'])
+
+# Request alias information for every index ("*" wildcard)
+all_indices = es.indices.get_alias(index="*")
+
+# Iterate the response and print each entry on its own line
+# (presumably the keys are index names; confirm against the client version in use)
+for index in all_indices:
+    print(index)