# ==================================================================================
# ✅ Allow major search engine bots, but block URLs containing "confirmation" or "dtx-confirmation"
# Reason: These are typically thank-you or conversion pages that do not need indexing.
# Blocking them improves SEO hygiene and keeps post-submission pages out of search results.
# ==================================================================================

# One shared group: every crawler named here receives the same two rules.
# "*" matches any run of characters, so each pattern matches its token anywhere
# in the URL path. URLs that match neither pattern remain crawlable.
User-agent: Googlebot
User-agent: Bingbot
User-agent: DuckDuckBot
User-agent: Applebot
User-agent: Slurp
Disallow: /*dtx-confirmation*
Disallow: /*confirmation*

# ==================================================================================
# ✅ Allow social media bots to fetch preview content (title, image, description)
# These bots are harmless and helpful for link sharing on platforms like Facebook & Twitter.
# ==================================================================================

# Link-preview fetchers share a single group that grants access to the whole site.
User-agent: facebookexternalhit
User-agent: Twitterbot
Allow: /

# ==================================================================================
# ✅ Allow Google to crawl images and news content
# Ensures media content is indexed properly for visibility in Google Image Search and News.
# ==================================================================================

# Google's image and news crawlers share a single group with unrestricted access.
# NOTE(review): a crawler obeys only its most specific matching group, so the
# confirmation-page rules in the Googlebot group presumably do not apply to
# these two crawlers — confirm that is intended.
User-agent: Googlebot-Image
User-agent: Googlebot-News
Allow: /

# ==================================================================================
# 🧠 Block modern AI crawlers and data-harvesting bots
# Reason: These bots scrape site content for AI training or data resale without permission.
# Blocking them protects intellectual property and privacy.
# ==================================================================================

# Single group: every crawler listed here is denied the entire site.
User-agent: GPTBot
User-agent: ChatGPT-User
User-agent: ClaudeBot
User-agent: anthropic-ai
User-agent: CCBot
User-agent: Amazonbot
User-agent: Bytespider
User-agent: Pinterestbot
Disallow: /

# ==================================================================================
# ❌ Block aggressive SEO scraping bots (excluding SemrushBot which is allowed)
# These bots often crawl heavily and add no SEO value. Blocking them conserves server resources.
# ==================================================================================

# Single group: every crawler listed here is denied the entire site.
User-agent: AhrefsBot
User-agent: MJ12bot
User-agent: Screaming Frog SEO Spider
User-agent: Sitebulb
User-agent: DotBot
User-agent: BLEXBot
User-agent: SEOkicks
User-agent: Exabot
User-agent: Yandex
User-agent: Baiduspider
Disallow: /

# ==================================================================================
# 🔒 Default rules for any unidentified or generic bots
# Applies the same protections: blocks sensitive paths, query-based duplicates, and private areas.
# ==================================================================================

# Fallback group: applies to any crawler that has no dedicated group in this file.
# Comments sit on their own lines so that no parser can mistake trailing text for
# part of a path pattern.
User-agent: *
# Post-submission pages: any URL containing "confirmation".
Disallow: /*dtx-confirmation*
Disallow: /*confirmation*
# URLs with query strings (often duplicates of the canonical page).
Disallow: /*?
# Deliberately no rule for "#" fragments: "#" begins a comment in robots.txt, so
# the former "Disallow: /*#" was read as "Disallow: /*" and blocked the entire
# site for this group. Fragments are never sent to the server, so there is
# nothing to block.
# Thank-you pages: any URL containing "thank".
Disallow: /*thank
# Temporary file directory.
Disallow: /tmp/
# Cached file directory.
Disallow: /cache/
# Admin panel.
Disallow: /admin/
# Career-related paths (prefix match: any path starting with /career).
Disallow: /career
# WordPress login and registration pages.
Disallow: /wp-login.php
Disallow: /wp-register.php
# NOTE(review): single club path blocked; reason is not recorded here — confirm it is still needed.
Disallow: /clubs/downtown-crossing

# ==================================================================================
# 🗺️ Sitemap declaration
# Helps search engines understand your site structure and prioritize crawling efficiently.
# ==================================================================================

Sitemap: https://www.newyorksportsclubs.com/sitemap_index.xml