Not able to crawl github repo recursively #408

Open
AugustusLiConnect opened this issue Jan 4, 2025 · 0 comments
@AugustusLiConnect

I am using the LLM extraction strategy to recursively crawl the folders under the URL https://github.com/unclecode/crawl4ai/tree/main/docs with this instruction:
instruction="""Extract the information of the children directories under the given link. And crawl the children directories recursively"""

I am running the following code.

from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field
import os, json
# from google.colab import userdata

class DirectoryInfo(BaseModel):
    name: str = Field(..., description="Name of the current directory.")
    url: str = Field(..., description="URL of the current directory.")
    children_directory_name_and_url: dict = Field(
        ..., description="Names and URLs of the children directories under the current directory."
    )
    files: list[str] = Field(
        ..., description="Names of the files under the current directory."
    )

# Page hooks: run custom code around navigation. Here they just attach a
# custom header and log the page URL.
async def before_goto_func(page, context=None, **kwargs):
    await page.set_extra_http_headers({"X-Custom-Header": "CustomValue"})
    print("Before goto function on page: ", page.url)

async def after_goto_func(page, context=None, **kwargs):
    await page.set_extra_http_headers({"X-Custom-Header": "CustomValue"})
    print("After goto function on page: ", page.url)


async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: dict = None):
    print(f"\n--- Extracting Structured Data with {provider} ---")
    print(api_token)
    print(extra_headers)

    # Skip if API token is missing (for providers that require it)
    if api_token is None and provider != "ollama":
        print(f"API token is required for {provider}. Skipping this example.")
        return

    extra_args = {"extra_headers": extra_headers} if extra_headers else {}

    async with AsyncWebCrawler(verbose=True) as crawler:

        crawler.crawler_strategy.set_hook("before_goto", before_goto_func)
        crawler.crawler_strategy.set_hook("after_goto", after_goto_func)

        result = await crawler.arun(
            url="https://github.com/unclecode/crawl4ai/tree/main/docs",
            word_count_threshold=1,
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=LLMExtractionStrategy(
                provider=provider,
                api_base=os.environ["AZURE_API_BASE"],
                api_token=api_token,
                schema=DirectoryInfo.model_json_schema(),
                extraction_type="schema",
                instruction="""Extract the information of the children directories under the given link. And crawl the the children directory recursively""",
                **extra_args
            ),
        )
        print(f"Found {len(result.links['internal'])} internal links")
        print(f"Found {len(result.links['external'])} external links")
        print(json.loads(result.extracted_content)[:5])
        # print(f"List of all the internal links: {result.links['internal']}")
        # print(f"List of all the external links: {result.links['external']}")

# Top-level await works in a notebook; in a plain script, wrap this in asyncio.run(...)
await extract_structured_data_using_llm("azure/gpt-4o-mini", os.getenv("AZURE_API_KEY"))

However, it only extracts the information from the top-level directory; it doesn't recursively crawl into the subdirectories. Here is the output:
INFO:LiteLLM:Wrapper: Completed Call, calling success_handler
[EXTRACT]. ■ Completed for https://github.com/unclecode/crawl4ai/tree/main/do... | Time: 2.7219022089848295s
[COMPLETE] ● https://github.com/unclecode/crawl4ai/tree/main/do... | Status: True | Total: 4.76s
Found 71 internal links
Found 2 external links
[{'name': 'docs', 'url': 'https://github.com/unclecode/crawl4ai/tree/main/docs', 'children_directory_name_and_url': {'assets': 'https://github.com/unclecode/crawl4ai/tree/main/docs/assets', 'deprecated': 'https://github.com/unclecode/crawl4ai/tree/main/docs/deprecated', 'examples': 'https://github.com/unclecode/crawl4ai/tree/main/docs/examples', 'md_v2': 'https://github.com/unclecode/crawl4ai/tree/main/docs/md_v2', 'md_v3/tutorials': 'https://github.com/unclecode/crawl4ai/tree/main/docs/md_v3/tutorials', 'notebooks': 'https://github.com/unclecode/crawl4ai/tree/main/docs/notebooks'}, 'files': [], 'error': False}]
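
If the extraction strategy only ever sees the single page passed to arun(), I assume the recursion has to be driven from the caller side, along the lines of the untested sketch below (crawl_directory and its max_depth guard are names I made up; the per-call arguments mirror the ones above):

import json
from crawl4ai import AsyncWebCrawler, CacheMode

# Untested sketch: re-run arun() on every child directory URL the LLM
# extracts, instead of expecting the instruction alone to trigger recursion.
# max_depth is a made-up guard so the whole repository isn't crawled.
async def crawl_directory(crawler, url, strategy, max_depth=2, depth=0):
    result = await crawler.arun(
        url=url,
        word_count_threshold=1,
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=strategy,
    )
    # extracted_content is a JSON string holding a list of schema dicts,
    # matching the output shown above
    for entry in json.loads(result.extracted_content):
        children = entry.get("children_directory_name_and_url") or {}
        print("  " * depth + f"{entry.get('name')}: {len(children)} children")
        if depth < max_depth:
            for child_url in children.values():
                await crawl_directory(crawler, child_url, strategy, max_depth, depth + 1)

Is there a built-in way to get this behavior, or is manual recursion like this expected?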

Thank you
