Scrapy: Duplicate Header in CSV File
I am extracting data from a website (https://www.bernama.com/en/crime_courts/) into CSV format using Scrapy. I want to append the data to the file instead of overwriting it. However, a duplicate header row is written to the CSV file every time I run the program. Is there a way to keep the header row from the first run while preventing duplicate header rows on subsequent runs?
Duplicate Header in CSV File Output: Output Image
Pipeline.py
import json
import os
import sys

import mysql.connector
from itemadapter import ItemAdapter
from mysql.connector import errorcode
from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.exporters import CsvItemExporter
class NewsPipeline:
    """Scrapy item pipeline that de-duplicates items by title and writes each
    accepted item to three sinks: ``news.json`` (JSON-lines, appended),
    ``news.csv`` (appended, single header row), and a MySQL ``news`` table.
    """

    def __init__(self):
        self.create_connection()
        self.create_table()
        # Per-spider CSV file handles, so spider_closed can close the right one.
        self.files = {}
        # Titles already exported in this run; used to drop duplicate items.
        self.ids_seen = set()

    def create_connection(self):
        """Connect to the local MySQL server and create a cursor.

        Exits the process with status 1 if the connection fails, since the
        pipeline cannot operate without the database.
        """
        try:
            self.conn = mysql.connector.connect(
                host='localhost',
                user='root',
                passwd='',
                port=3306,
                database='crime_news',
            )
        # BUG FIX: was `mysql.Error`, which does not exist (the exception
        # class lives on mysql.connector) and would raise AttributeError
        # while handling a real connection failure.
        except mysql.connector.Error as e:
            print(f"Error connecting to DB Platform: {e}")
            sys.exit(1)
        self.curr = self.conn.cursor()

    def create_table(self):
        """(Re)create the ``news`` table.

        NOTE(review): dropping the table discards rows from previous runs,
        which is at odds with appending to the CSV/JSON files — consider
        ``CREATE TABLE IF NOT EXISTS`` if the database should accumulate too.
        Kept as-is to preserve existing behavior.
        """
        self.curr.execute("""DROP TABLE IF EXISTS news""")
        self.curr.execute("""CREATE TABLE news (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(150) NOT NULL,
            link VARCHAR(200) NOT NULL,
            date VARCHAR(50) NOT NULL,
            time VARCHAR(50) NOT NULL,
            location VARCHAR(50) NOT NULL
        )""")

    def store_db(self, item):
        """Insert one item into the ``news`` table (parameterized query)."""
        sql = """INSERT INTO news (title, link, date, time, location) VALUES (%s, %s, %s, %s, %s)"""
        value = (
            item.get('title'),
            item.get('link'),
            item.get('date'),
            item.get('time'),
            item.get('location'),
        )
        self.curr.execute(sql, value)
        self.conn.commit()

    def process_item(self, item, spider):
        """Drop items whose title was already seen; otherwise write the item
        to the JSON file, the CSV exporter, and the database.
        """
        adapter = ItemAdapter(item)
        if adapter['title'] in self.ids_seen:
            raise DropItem(f"Duplicate item found: {item!r}")
        self.ids_seen.add(adapter['title'])
        line = json.dumps(adapter.asdict()) + "\n"
        self.file.write(line)
        self.exporter.export_item(item)
        self.store_db(item)
        return item

    def spider_opened(self, spider):
        """Open the output files when the spider starts.

        BUG FIX (the reported duplicate-header problem): the CSV file is
        opened in append mode, but CsvItemExporter writes a header line every
        time it starts exporting. Only ask for the header when the file is
        new or empty, so repeated runs append rows under a single header.
        """
        self.file = open('news.json', 'a')
        csv_path = 'news.csv'
        write_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0
        csv_file = open(csv_path, 'ab')
        # BUG FIX: the handle was previously bound to a local and never
        # stored, so spider_closed's self.files.pop(spider) raised KeyError
        # and the CSV file was never closed.
        self.files[spider] = csv_file
        self.exporter = CsvItemExporter(csv_file, include_headers_line=write_header)
        # Fix the column order explicitly: when appending across runs, rows
        # must line up with the header written by the first run.
        self.exporter.fields_to_export = ['title', 'link', 'date', 'time', 'location']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close every resource when the spider stops."""
        self.exporter.finish_exporting()
        csv_file = self.files.pop(spider, None)
        if csv_file is not None:
            csv_file.close()
        self.file.close()
        self.conn.close()

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it up to the spider lifecycle signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
