import scrapy
from scrapy.spiders import SitemapSpider
import logging
# Configure the root logger at INFO level as a side effect of importing this module.
logging.basicConfig(level=logging.INFO)
class XamvnChatSpider(SitemapSpider):
    """Crawl threads discovered through the xamvn.chat sitemap.

    Each sitemap URL matching ``/r/`` is treated as the first page of a
    thread. The spider follows the "next page" link page by page, carrying
    the messages collected so far in ``cb_kwargs``, and emits a single item
    per thread once no further page link is found.
    """

    name = 'xamvn_chat'
    allowed_domains = ['xamvn.chat']
    sitemap_urls = ['https://xamvn.chat/sitemap.xml']
    custom_settings = {
        'COOKIES_ENABLED': False,
        'LOG_LEVEL': 'INFO',
    }
    # Sitemap URLs matching the '/r/' pattern are dispatched to parse_thread.
    sitemap_rules = [
        ('/r/', 'parse_thread'),
    ]

    def parse_thread(self, response, thread_url=None, title=None, accumulated_messages=None):
        """Parse one page of a thread and either follow pagination or emit the item.

        Args:
            response: The downloaded thread page.
            thread_url: URL of the thread's first page; defaults to
                ``response.url`` when this is the first page of the chain.
            title: Thread title resolved on an earlier page, or ``None`` to
                extract it from this page.
            accumulated_messages: Messages gathered on previous pages. The
                list is extended in place and forwarded to the next page.

        Yields:
            A ``scrapy.Request`` for the next page while one exists, otherwise
            a dict with ``url``, ``title``, ``messages`` and ``message_count``.
        """
        if thread_url is None:
            thread_url = response.url
        if accumulated_messages is None:
            accumulated_messages = []

        if title is None:
            title = self._extract_title(response)
            if not title:
                title = "Untitled Thread"  # Provide a default
                self.logger.warning(
                    "Could not extract title for thread starting at %s", thread_url
                )

        accumulated_messages.extend(self._extract_messages(response))

        next_href = response.css('a.pageNav-jump.pageNav-jump--next::attr(href)').get()
        if not next_href:
            yield self._build_item(thread_url, title, accumulated_messages)
            return

        yield scrapy.Request(
            url=response.urljoin(next_href.strip()),
            callback=self.parse_thread,
            # Without an errback, a failed follow-up page would silently drop
            # every message accumulated so far for this thread.
            errback=self._on_page_error,
            cb_kwargs={
                'thread_url': thread_url,
                'title': title,
                'accumulated_messages': accumulated_messages,
            },
        )

    @staticmethod
    def _extract_title(response):
        """Return the first non-blank ``h1`` text node, stripped, or ``None``."""
        # Checking only the first node would turn a whitespace-only node into
        # an empty title, so scan until a node with real text is found.
        for text in response.css('h1::text').getall():
            stripped = text.strip()
            if stripped:
                return stripped
        return None

    @staticmethod
    def _extract_order(article):
        """Return the post-number label (e.g. ``'#12'``) of a message, or ``None``."""
        link_texts = article.css('.message-attribution-opposite a::text').getall()
        # Scan from the last link backwards and keep the first '#<digits>' label.
        for text in reversed(link_texts):
            label = text.strip()
            if label.startswith('#') and label[1:].isdigit():
                return label
        return None

    def _extract_messages(self, response):
        """Return the messages on this page that have both a user id and content."""
        messages = []
        for article in response.css('.block-container .message--post'):
            user_id = article.css('a.username::attr(data-user-id)').get()
            content = article.css('.bbWrapper').get()
            if not (user_id and content):
                self.logger.warning(
                    "Skipping message on %s (User: %s) due to missing data.",
                    response.url,
                    user_id,
                )
                continue
            messages.append({
                'data_user_id': user_id,
                'timestamp': article.css('time::attr(datetime)').get(),
                'content': content,
                'order': self._extract_order(article),
            })
        return messages

    @staticmethod
    def _build_item(thread_url, title, messages):
        """Assemble the per-thread item from the collected state."""
        return {
            'url': thread_url,
            'title': title,
            'messages': messages,
            'message_count': len(messages),
        }

    def _on_page_error(self, failure):
        """Emit the partially collected thread when a follow-up page fails."""
        request = getattr(failure, 'request', None)
        if request is None:
            self.logger.error("Thread page request failed: %r", failure)
            return
        state = request.cb_kwargs
        messages = state.get('accumulated_messages') or []
        self.logger.warning(
            "Failed to fetch %s; emitting thread %s with %d messages collected so far",
            request.url,
            state.get('thread_url'),
            len(messages),
        )
        yield self._build_item(state.get('thread_url'), state.get('title'), messages)