# Assumes `client` is an already-initialized chat-completions client
# (e.g. azure.ai.inference.ChatCompletionsClient) exposing a `.complete()` method.
response = client.complete(
    model="Llama-3.3-70B-Instruct",
    messages=[
        {"role": "system", "content": "You are concise in your delivery and overall very helpful as an assistant."},
        {"role": "user", "content": "Can you list some website URLs pertaining to data management frameworks? A few examples are platforms like Alation, Infinity Self Storage, and Unity Catalog. My current objectives are to provide data products for products, citizen development, and AI. Some other points: Empower teams with trusted data to drive better decisions. Reduce time spent finding and preparing data. Ensure clear ownership, consistent definitions, and compliance. Provide a robust foundation for digital products and AI/ML. The end view of delivery is a website that's denoted as a 'Data Marketplace.' On the website we plan to create, you can search for specific types of data in a search engine and filter by franchise and focus area. Overall we want this type of delivery, but we still need to be able to build and maintain this type of platform. So please provide a focused list of URLs covering only the platforms mentioned, plus other platforms I didn't mention that are closely comparable (they can be specific to supply chain or just data management)."}
    ],
    max_tokens=1025,
    temperature=0.8,
    top_p=0.1,   # low top_p makes sampling near-deterministic despite temperature=0.8
    presence_penalty=0.0,
    frequency_penalty=0.0,
)

print(response.choices[0].message.content)
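# The parsing step below reads the reply from "data_management_links_2.txt",
# so it has to be written to disk first. This is a minimal sketch of that step,
# assuming the file was produced directly from the response above.
with open("data_management_links_2.txt", "w", encoding="utf-8") as f:
    f.write(response.choices[0].message.content)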
import re
import pandas as pd

# Read back the model's reply saved above
with open("data_management_links_2.txt", "r", encoding="utf-8") as f:
    content = f.read()

# Each link in the reply is expected on a markdown numbered line such as
# "1. **Name**: URL"; capture the bolded name and the URL.
matches = re.findall(r"\d+\.\s+\*\*(.*?)\*\*:\s+(https?://[^\s)]+)", content)
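
# Quick self-contained sanity check of the pattern (the sample line is
# hypothetical, illustrating the markdown format the reply is expected to use):
sample_line = "1. **Alation**: https://www.alation.com/"
assert re.findall(r"\d+\.\s+\*\*(.*?)\*\*:\s+(https?://[^\s)]+)", sample_line) == [
    ("Alation", "https://www.alation.com/")
]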

# The first 14 matches in the reply are the general data-management platforms;
# the remainder are the supply-chain-specific ones.
split_index = 14

# First portion: general data-management platforms
df1 = pd.DataFrame(matches[:split_index], columns=["Source", "URL"])

# Second portion: supply-chain-specific platforms
df2 = pd.DataFrame(matches[split_index:], columns=["Source", "URL"])

names = ["General Data Frameworks", "Supply-Chain Specific"]
dataframes = [df1, df2]

# Map each category name to its DataFrame
named_dfs = dict(zip(names, dataframes))
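
# Optional: persist each category's links for reuse outside this script
# (the CSV file names are illustrative).
for name, df in named_dfs.items():
    df.to_csv(f"{name.lower().replace(' ', '_')}.csv", index=False)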
# Dependencies for the scraping and phrase-cloud steps
# (re and pandas were already imported above)
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer

def get_webpage_text(url):
    """Fetch a page and return its visible text, or "" on any request failure."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        return soup.get_text(separator=" ", strip=True)
    except requests.RequestException:
        return ""

# Loop through each category/dataframe in named_dfs
for category, df in named_dfs.items():
    documents = []
    for url in df['URL']:
        content = get_webpage_text(url)
        if content:
            documents.append(content)
    if not documents:
        print(f"Skipping {category} (no valid content)")
        continue

    # Extract phrases using bigrams and trigrams
    vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words='english')
    X = vectorizer.fit_transform(documents)

    # Sum each phrase's count across all documents and rank by total frequency
    sums = X.sum(axis=0)
    phrases_freq = [(phrase, sums[0, idx]) for phrase, idx in vectorizer.vocabulary_.items()]
    phrases_freq = sorted(phrases_freq, key=lambda x: x[1], reverse=True)

    # Create WordCloud
    wordcloud = WordCloud(width=1600, height=800, background_color='white').generate_from_frequencies(dict(phrases_freq[:100]))

    # Show
    plt.figure(figsize=(14, 7))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Phrase Cloud for {category}", fontsize=20)
    plt.show()
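    # Optionally persist the rendered cloud as well; the PNG file name is
    # illustrative. WordCloud.to_file() writes the image to disk.
    wordcloud.to_file(f"phrase_cloud_{category.lower().replace(' ', '_')}.png")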